Update aom to v1.0.0

Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
author: trav90 <travawine@palemoon.org> 2018-10-19 21:52:15 -0500
committer: trav90 <travawine@palemoon.org> 2018-10-19 21:52:20 -0500
commit: bbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree: 437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/aom_dsp
parent: 14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
download: UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip
281 files changed, 21362 insertions, 63320 deletions
diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c
index 389cf2049..bfb3e7e00 100644
--- a/third_party/aom/aom_dsp/add_noise.c
+++ b/third_party/aom/aom_dsp/add_noise.c
@@ -12,8 +12,8 @@
 #include <math.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/ans.h b/third_party/aom/aom_dsp/ans.h
deleted file mode 100644
index a7a2f0eab..000000000
--- a/third_party/aom/aom_dsp/ans.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_ANS_H_
-#define AOM_DSP_ANS_H_
-// Constants, types and utilities for Asymmetric Numeral Systems
-// http://arxiv.org/abs/1311.2540v2
-
-#include <assert.h>
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/prob.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-// Use windowed ANS, size is passed in at initialization
-#define ANS_MAX_SYMBOLS 1
-#define ANS_REVERSE 1
-
-typedef uint8_t AnsP8;
-#define ANS_P8_PRECISION 256u
-#define ANS_P8_SHIFT 8
-#define RANS_PROB_BITS 15
-#define RANS_PRECISION (1u << RANS_PROB_BITS)
-
-// L_BASE is the ANS base state. L_BASE % PRECISION must be 0.
-#define L_BASE (1u << 17)
-#define IO_BASE 256
-// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_DSP_ANS_H_
diff --git a/third_party/aom/aom_dsp/ansreader.h b/third_party/aom/aom_dsp/ansreader.h
deleted file mode 100644
index e50c63b2d..000000000
--- a/third_party/aom/aom_dsp/ansreader.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_ANSREADER_H_
-#define AOM_DSP_ANSREADER_H_
-// An implementation of Asymmetric Numeral Systems
-// http://arxiv.org/abs/1311.2540v2
-// Implements decoding of:
-// * rABS (range Asymmetric Binary Systems), a boolean coder
-// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder
-
-#include <assert.h>
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/prob.h"
-#include "aom_dsp/ans.h"
-#include "aom_ports/mem_ops.h"
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-struct AnsDecoder {
-  const uint8_t *buf;
-  int buf_offset;
-  uint32_t state;
-#if ANS_MAX_SYMBOLS
-  int symbols_left;
-  int window_size;
-#endif
-#if CONFIG_ACCOUNTING
-  Accounting *accounting;
-#endif
-};
-
-static INLINE int ans_read_reinit(struct AnsDecoder *const ans);
-
-static INLINE unsigned refill_state(struct AnsDecoder *const ans,
-                                    unsigned state) {
-#if ANS_REVERSE
-  while (state < L_BASE && ans->buf_offset < 0) {
-    state = state * IO_BASE + ans->buf[ans->buf_offset++];
-  }
-#else
-  while (state < L_BASE && ans->buf_offset > 0) {
-    state = state * IO_BASE + ans->buf[--ans->buf_offset];
-  }
-#endif
-  return state;
-}
-
-// Decode one rABS encoded boolean where the probability of the value being zero
-// is p0.
-static INLINE int rabs_read(struct AnsDecoder *ans, AnsP8 p0) {
-#if ANS_MAX_SYMBOLS
-  if (ans->symbols_left-- == 0) {
-    ans_read_reinit(ans);
-    ans->symbols_left--;
-  }
-#endif
-  unsigned state = refill_state(ans, ans->state);
-  const unsigned quotient = state / ANS_P8_PRECISION;
-  const unsigned remainder = state % ANS_P8_PRECISION;
-  const int value = remainder >= p0;
-  const unsigned qp0 = quotient * p0;
-  if (value)
-    state = state - qp0 - p0;
-  else
-    state = qp0 + remainder;
-  ans->state = state;
-  return value;
-}
-
-// Decode one rABS encoded boolean where the probability of the value being zero
-// is one half.
-static INLINE int rabs_read_bit(struct AnsDecoder *ans) {
-#if ANS_MAX_SYMBOLS
-  if (ans->symbols_left-- == 0) {
-    ans_read_reinit(ans);
-    ans->symbols_left--;
-  }
-#endif
-  unsigned state = refill_state(ans, ans->state);
-  const int value = !!(state & 0x80);
-  ans->state = ((state >> 1) & ~0x7F) | (state & 0x7F);
-  return value;
-}
-
-struct rans_dec_sym {
-  uint8_t val;
-  aom_cdf_prob prob;
-  aom_cdf_prob cum_prob;  // not-inclusive
-};
-
-static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
-                             aom_cdf_prob rem) {
-  int i;
-  aom_cdf_prob cum_prob = 0, top_prob;
-  // TODO(skal): if critical, could be a binary search.
-  // Or, better, an O(1) alias-table.
-  for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
-    cum_prob = top_prob;
-  }
-  out->val = i;
-  out->prob = top_prob - cum_prob;
-  out->cum_prob = cum_prob;
-}
-
-static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
-  unsigned rem;
-  unsigned quo;
-  struct rans_dec_sym sym;
-#if ANS_MAX_SYMBOLS
-  if (ans->symbols_left-- == 0) {
-    ans_read_reinit(ans);
-    ans->symbols_left--;
-  }
-#endif
-  ans->state = refill_state(ans, ans->state);
-  quo = ans->state / RANS_PRECISION;
-  rem = ans->state % RANS_PRECISION;
-  fetch_sym(&sym, tab, rem);
-  ans->state = quo * sym.prob + rem - sym.cum_prob;
-  return sym.val;
-}
-
-static INLINE int ans_read_init(struct AnsDecoder *const ans,
-                                const uint8_t *const buf, int offset) {
-  unsigned x;
-  if (offset < 1) return 1;
-#if ANS_REVERSE
-  ans->buf = buf + offset;
-  ans->buf_offset = -offset;
-  x = buf[0];
-  if ((x & 0x80) == 0) {  // Marker is 0xxx xxxx
-    if (offset < 2) return 1;
-    ans->buf_offset += 2;
-    ans->state = mem_get_be16(buf) & 0x7FFF;
-#if L_BASE * IO_BASE > (1 << 23)
-  } else if ((x & 0xC0) == 0x80) {  // Marker is 10xx xxxx
-    if (offset < 3) return 1;
-    ans->buf_offset += 3;
-    ans->state = mem_get_be24(buf) & 0x3FFFFF;
-  } else {  // Marker is 11xx xxxx
-    if (offset < 4) return 1;
-    ans->buf_offset += 4;
-    ans->state = mem_get_be32(buf) & 0x3FFFFFFF;
-#else
-  } else {  // Marker is 1xxx xxxx
-    if (offset < 3) return 1;
-    ans->buf_offset += 3;
-    ans->state = mem_get_be24(buf) & 0x7FFFFF;
-#endif
-  }
-#else
-  ans->buf = buf;
-  x = buf[offset - 1];
-  if ((x & 0x80) == 0) {  // Marker is 0xxx xxxx
-    if (offset < 2) return 1;
-    ans->buf_offset = offset - 2;
-    ans->state = mem_get_le16(buf + offset - 2) & 0x7FFF;
-  } else if ((x & 0xC0) == 0x80) {  // Marker is 10xx xxxx
-    if (offset < 3) return 1;
-    ans->buf_offset = offset - 3;
-    ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
-  } else if ((x & 0xE0) == 0xE0) {  // Marker is 111x xxxx
-    if (offset < 4) return 1;
-    ans->buf_offset = offset - 4;
-    ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
-  } else {
-    // Marker 110x xxxx implies this byte is a superframe marker
-    return 1;
-  }
-#endif  // ANS_REVERSE
-#if CONFIG_ACCOUNTING
-  ans->accounting = NULL;
-#endif
-  ans->state += L_BASE;
-  if (ans->state >= L_BASE * IO_BASE) return 1;
-#if ANS_MAX_SYMBOLS
-  assert(ans->window_size > 1);
-  ans->symbols_left = ans->window_size;
-#endif
-  return 0;
-}
-
-#if ANS_REVERSE
-static INLINE int ans_read_reinit(struct AnsDecoder *const ans) {
-  return ans_read_init(ans, ans->buf + ans->buf_offset, -ans->buf_offset);
-}
-#endif
-
-static INLINE int ans_read_end(const struct AnsDecoder *const ans) {
-  return ans->buf_offset == 0 && ans->state < L_BASE;
-}
-
-static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
-  return ans->state < L_BASE / RANS_PRECISION;
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_DSP_ANSREADER_H_
diff --git a/third_party/aom/aom_dsp/answriter.h b/third_party/aom/aom_dsp/answriter.h
deleted file mode 100644
index 353acf1a9..000000000
--- a/third_party/aom/aom_dsp/answriter.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_ANSWRITER_H_
-#define AOM_DSP_ANSWRITER_H_
-// An implementation of Asymmetric Numeral Systems
-// http://arxiv.org/abs/1311.2540v2
-// Implements encoding of:
-// * rABS (range Asymmetric Binary Systems), a boolean coder
-// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder
-
-#include <assert.h>
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/ans.h"
-#include "aom_dsp/prob.h"
-#include "aom_ports/mem_ops.h"
-#include "av1/common/odintrin.h"
-
-#if RANS_PRECISION <= OD_DIVU_DMAX
-#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
-  do {                                                     \
-    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
-    remainder = (dividend) - (quotient) * (divisor);       \
-  } while (0)
-#else
-#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
-  do {                                                     \
-    quotient = (dividend) / (divisor);                     \
-    remainder = (dividend) % (divisor);                    \
-  } while (0)
-#endif
-
-#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-struct AnsCoder {
-  uint8_t *buf;
-  int buf_offset;
-  uint32_t state;
-};
-
-static INLINE void ans_write_init(struct AnsCoder *const ans,
-                                  uint8_t *const buf) {
-  ans->buf = buf;
-  ans->buf_offset = 0;
-  ans->state = L_BASE;
-}
-
-static INLINE int ans_write_end(struct AnsCoder *const ans) {
-  uint32_t state;
-  int ans_size;
-  assert(ans->state >= L_BASE);
-  assert(ans->state < L_BASE * IO_BASE);
-  state = ans->state - L_BASE;
-  if (state < (1u << 15)) {
-    mem_put_le16(ans->buf + ans->buf_offset, (0x00u << 15) + state);
-    ans_size = ans->buf_offset + 2;
-#if ANS_REVERSE
-#if L_BASE * IO_BASE > (1 << 23)
-  } else if (state < (1u << 22)) {
-    mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
-    ans_size = ans->buf_offset + 3;
-  } else if (state < (1u << 30)) {
-    mem_put_le32(ans->buf + ans->buf_offset, (0x03u << 30) + state);
-    ans_size = ans->buf_offset + 4;
-#else
-  } else if (state < (1u << 23)) {
-    mem_put_le24(ans->buf + ans->buf_offset, (0x01u << 23) + state);
-    ans_size = ans->buf_offset + 3;
-#endif
-#else
-  } else if (state < (1u << 22)) {
-    mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
-    ans_size = ans->buf_offset + 3;
-  } else if (state < (1u << 29)) {
-    mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state);
-    ans_size = ans->buf_offset + 4;
-#endif
-  } else {
-    assert(0 && "State is too large to be serialized");
-    return ans->buf_offset;
-  }
-#if ANS_REVERSE
-  {
-    int i;
-    uint8_t tmp;
-    for (i = 0; i < (ans_size >> 1); i++) {
-      tmp = ans->buf[i];
-      ans->buf[i] = ans->buf[ans_size - 1 - i];
-      ans->buf[ans_size - 1 - i] = tmp;
-    }
-    ans->buf += ans_size;
-    ans->buf_offset = 0;
-    ans->state = L_BASE;
-  }
-#endif
-  return ans_size;
-}
-
-// Write one boolean using rABS where p0 is the probability of the value being
-// zero.
-static INLINE void rabs_write(struct AnsCoder *ans, int value, AnsP8 p0) {
-  const AnsP8 p = ANS_P8_PRECISION - p0;
-  const unsigned l_s = value ? p : p0;
-  unsigned state = ans->state;
-  while (state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) {
-    ans->buf[ans->buf_offset++] = state % IO_BASE;
-    state /= IO_BASE;
-  }
-  const unsigned quotient = ANS_DIV8(state, l_s);
-  const unsigned remainder = state - quotient * l_s;
-  ans->state = quotient * ANS_P8_PRECISION + remainder + (value ? p0 : 0);
-}
-
-// Encode one symbol using rANS.
-// cum_prob: The cumulative probability before this symbol (the offset of
-// the symbol in the symbol cycle)
-// prob: The probability of this symbol (l_s from the paper)
-// RANS_PRECISION takes the place of m from the paper.
-static INLINE void rans_write(struct AnsCoder *ans, aom_cdf_prob cum_prob,
-                              aom_cdf_prob prob) {
-  unsigned quotient, remainder;
-  while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * prob) {
-    ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
-    ans->state /= IO_BASE;
-  }
-  ANS_DIVREM(quotient, remainder, ans->state, prob);
-  ans->state = quotient * RANS_PRECISION + remainder + cum_prob;
-}
-
-#undef ANS_DIV8
-#undef ANS_DIVREM
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_DSP_ANSWRITER_H_
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
index c903ea52d..bba37e227 100644
--- a/third_party/aom/aom_dsp/aom_convolve.c
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -12,73 +12,40 @@
 #include <assert.h>
 #include <string.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
 
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+                                      const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
+}
+
 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *x_filters, int x0_q4,
                            int x_step_q4, int w, int h) {
-  int x, y;
   src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
+  for (int y = 0; y < h; ++y) {
     int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
+    for (int x = 0; x < w; ++x) {
       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *x_filters, int x0_qn,
-                                   int x_step_qn, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *const x_filter = x_filters[x_filter_idx];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      const int sum = horz_scalar_product(src_x, x_filter);
       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const InterpKernel *x_filters, int x0_q4,
-                               int x_step_q4, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
       x_q4 += x_step_q4;
     }
     src += src_stride;
@@ -86,97 +53,19 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_qn,
-                                       int x_step_qn, int w, int h) {
-  int x, y;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *const x_filter = x_filters[x_filter_idx];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *y_filters, int y0_q4,
                           int y_step_q4, int w, int h) {
-  int x, y;
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
 
-  for (x = 0; x < w; ++x) {
+  for (int x = 0; x < w; ++x) {
     int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
+    for (int y = 0; y < h; ++y) {
       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const InterpKernel *y_filters, int y0_qn,
-                                  int y_step_qn, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_qn = y0_qn;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter =
-          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
+      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *y_filters, int y0_q4,
-                              int y_step_q4, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
-          1);
       y_q4 += y_step_q4;
     }
     ++src;
@@ -184,103 +73,6 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_qn,
-                                      int y_step_qn, int w, int h) {
-  int x, y;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_qn = y0_qn;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter =
-          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
-          1);
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-                     int x0_q4, int x_step_q4,
-                     const InterpKernel *const y_filters, int y0_q4,
-                     int y_step_q4, int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
-                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                 intermediate_height);
-  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
-                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
-}
-
-static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_qn,
-                             int x_step_qn, const InterpKernel *const y_filters,
-                             int y0_qn, int y_step_qn, int w, int h) {
-  // TODO(afergs): Update comment here
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
-  assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
-
-  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
-                         intermediate_height);
-  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
-                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
-}
-
 static const InterpKernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
   // TODO(agrange) Modify to make independent of table alignment.
@@ -306,52 +98,6 @@ void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                  w, h);
 }
 
-void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int subpel_x,
-                                 int x_step_qn, const int16_t *filter_y,
-                                 int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  (void)subpel_y;
-  (void)filter_y;
-  (void)y_step_qn;
-
-  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
-                         x_step_qn, w, h);
-}
-
-void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                     x_step_q4, w, h);
-}
-
-void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int subpel_x,
-                                     int x_step_qn, const int16_t *filter_y,
-                                     int subpel_y, int y_step_qn, int w,
-                                     int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  (void)subpel_y;
-  (void)filter_y;
-  (void)y_step_qn;
-
-  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
-                             subpel_x, x_step_qn, w, h);
-}
-
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
@@ -367,109 +113,6 @@ void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                 w, h);
 }
 
-void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int subpel_x,
-                                int x_step_qn, const int16_t *filter_y,
-                                int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  (void)subpel_x;
-  (void)filter_x;
-  (void)x_step_qn;
-
-  convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
-                        y_step_qn, w, h);
-}
-
-void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                    y_step_q4, w, h);
-}
-
-void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int subpel_x,
-                                    int x_step_qn, const int16_t *filter_y,
-                                    int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  (void)subpel_x;
-  (void)filter_x;
-  (void)x_step_qn;
-
-  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
-                            subpel_y, y_step_qn, w, h);
-}
-
-void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-           filters_y, y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int subpel_x, int x_step_qn,
-                           const int16_t *filter_y, int subpel_y, int y_step_qn,
-                           int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
-                   x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
-}
-
-void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
-  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
-                     h);
-}
-
-void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int subpel_x,
-                               int x_step_qn, const int16_t *filter_y,
-                               int subpel_y, int y_step_qn, int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
-                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
-  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
-                     h);
-}
-
 void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter_x,
                          int filter_x_stride, const int16_t *filter_y,
@@ -488,330 +131,34 @@ void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   }
 }
 
-void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int filter_x_stride, const int16_t *filter_y,
-                        int filter_y_stride, int w, int h) {
-  int x, y;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                       int w, int h) {
-  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                       filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h) {
-  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                      filter_y, y_step_q4, w, h);
-}
-
-// TODO(afergs): Make sure this works too
-#if CONFIG_LOOP_RESTORATION
-static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *x_filters, int x0_q4,
-                                   int x_step_q4, int w, int h) {
-  int x, y, k;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                          src_x[SUBPEL_TAPS / 2 - 1]);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const InterpKernel *y_filters, int y0_q4,
-                                  int y_step_q4, int w, int h) {
-  int x, y, k;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                         temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                         intermediate_height);
-  convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
-                        dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                         x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                        y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_q4,
-                                       int x_step_q4, int w, int h) {
-  const int bd = 8;
-  int x, y, k;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] =
-          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
-                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_q4,
-                                      int y_step_q4, int w, int h) {
-  const int bd = 8;
-  int x, y, k;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const InterpKernel *const x_filters, int x0_q4,
-                                 int x_step_q4,
-                                 const InterpKernel *const y_filters, int y0_q4,
-                                 int y_step_q4, int w, int h) {
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                             x_step_q4, w, intermediate_height);
-  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                             x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                            y_step_q4, w, h);
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
 }
 
-void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                       x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
 }
-#endif  // CONFIG_LOOP_RESTORATION
 
-// TODO(afergs): Make sure this works too
-#if CONFIG_HIGHBITDEPTH
 static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h, int bd) {
-  int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
+  for (int y = 0; y < h; ++y) {
     int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
+    for (int x = 0; x < w; ++x) {
       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      const int sum = highbd_horz_scalar_product(src_x, x_filter);
       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       x_q4 += x_step_q4;
     }
@@ -820,47 +167,19 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                      uint8_t *dst8, ptrdiff_t dst_stride,
-                                      const InterpKernel *x_filters, int x0_q4,
-                                      int x_step_q4, int w, int h, int bd) {
-  int x, y;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-          1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h, int bd) {
-  int x, y;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
+  for (int x = 0; x < w; ++x) {
     int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
+    for (int y = 0; y < h; ++y) {
       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
+      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
       dst[y * dst_stride] =
           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       y_q4 += y_step_q4;
@@ -870,67 +189,6 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                     uint8_t *dst8, ptrdiff_t dst_stride,
-                                     const InterpKernel *y_filters, int y0_q4,
-                                     int y_step_q4, int w, int h, int bd) {
-  int x, y;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-          1);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const InterpKernel *const x_filters, int x0_q4,
-                            int x_step_q4, const InterpKernel *const y_filters,
-                            int y0_q4, int y_step_q4, int w, int h, int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
-                        x_step_q4, w, intermediate_height, bd);
-  highbd_convolve_vert(
-      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
-}
-
 void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
@@ -945,20 +203,6 @@ void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                         x_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                            x_step_q4, w, h, bd);
-}
-
 void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
@@ -973,51 +217,6 @@ void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                        y_step_q4, w, h, bd);
 }
 
-void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
-                                     int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                           y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                  filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
-                                int h, int bd) {
-  // Fixed size intermediate buffer places limits on parameters.
-  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
-                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
-                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
-}
-
 void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int filter_x_stride,
@@ -1038,295 +237,3 @@ void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
     dst += dst_stride;
   }
 }
-
-void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
-                               uint8_t *dst8, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int filter_x_stride,
-                               const int16_t *filter_y, int filter_y_stride,
-                               int w, int h, int bd) {
-  int x, y;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-#if CONFIG_LOOP_RESTORATION
-static void highbd_convolve_add_src_horiz(const uint8_t *src8,
-                                          ptrdiff_t src_stride, uint8_t *dst8,
-                                          ptrdiff_t dst_stride,
-                                          const InterpKernel *x_filters,
-                                          int x0_q4, int x_step_q4, int w,
-                                          int h, int bd) {
-  int x, y, k;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
-          bd);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_add_src_vert(const uint8_t *src8,
-                                         ptrdiff_t src_stride, uint8_t *dst8,
-                                         ptrdiff_t dst_stride,
-                                         const InterpKernel *y_filters,
-                                         int y0_q4, int y_step_q4, int w, int h,
-                                         int bd) {
-  int x, y, k;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] =
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
-                            bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *const x_filters,
-                                    int x0_q4, int x_step_q4,
-                                    const InterpKernel *const y_filters,
-                                    int y0_q4, int y_step_q4, int w, int h,
-                                    int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                                src_stride, CONVERT_TO_BYTEPTR(temp),
-                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                                intermediate_height, bd);
-  highbd_convolve_add_src_vert(
-      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_horiz_c(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
-                                x0_q4, x_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
-                                         ptrdiff_t src_stride, uint8_t *dst,
-                                         ptrdiff_t dst_stride,
-                                         const int16_t *filter_x, int x_step_q4,
-                                         const int16_t *filter_y, int y_step_q4,
-                                         int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
-                               y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                          x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-static void highbd_convolve_add_src_horiz_hip(
-    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
-    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
-    int x_step_q4, int w, int h, int bd) {
-  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
-  int x, y, k;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                (1 << (bd + FILTER_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-      dst[x] =
-          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
-                          0, extraprec_clamp_limit - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_add_src_vert_hip(
-    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
-    int y_step_q4, int w, int h, int bd) {
-  int x, y, k;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      int sum =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
-      for (k = 0; k < SUBPEL_TAPS; ++k)
-        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_add_src_hip(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
-    int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
-    int y_step_q4, int w, int h, int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_add_src_horiz_hip(
-      src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
-      x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
-  highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                                   MAX_SB_SIZE, dst, dst_stride, y_filters,
-                                   y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_horiz_hip_c(
-    const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x,
-                                    x0_q4, x_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_vert_hip_c(
-    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y,
-                                   y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
-                                        ptrdiff_t src_stride, uint8_t *dst,
-                                        ptrdiff_t dst_stride,
-                                        const int16_t *filter_x, int x_step_q4,
-                                        const int16_t *filter_y, int y_step_q4,
-                                        int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
-                              x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
-                              h, bd);
-}
-
-#endif  // CONFIG_LOOP_RESTORATION
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h
index c7943dced..6f5b888e4 100644
--- a/third_party/aom/aom_dsp/aom_convolve.h
+++ b/third_party/aom/aom_dsp/aom_convolve.h
@@ -11,7 +11,8 @@
 #ifndef AOM_DSP_AOM_CONVOLVE_H_
 #define AOM_DSP_AOM_CONVOLVE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #ifdef __cplusplus
@@ -30,16 +31,11 @@ extern "C" {
 // --Must round-up because block may be located at sub-pixel position.
 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+// TODO(wtc): Update the above comment to explain the value 263 used in aom.
 #define MAX_EXT_SIZE 263
-#else
-#define MAX_EXT_SIZE 135
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
-#if CONFIG_AV1 && CONFIG_LOOP_RESTORATION
 #define EXTRAPREC_BITS 2
 #define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS))
-#endif
 
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
@@ -47,13 +43,11 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h);
 
-#if CONFIG_HIGHBITDEPTH
 typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd);
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
index 11b55caa7..768875f7d 100644
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -1,475 +1,242 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-if (NOT AOM_AOM_DSP_AOM_DSP_CMAKE_)
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
+  return()
+endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
 set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
 
-set(AOM_DSP_COMMON_SOURCES
-    "${AOM_ROOT}/aom_dsp/aom_convolve.c"
-    "${AOM_ROOT}/aom_dsp/aom_convolve.h"
-    "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
-    "${AOM_ROOT}/aom_dsp/aom_filter.h"
-    "${AOM_ROOT}/aom_dsp/aom_simd.h"
-    "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
-    "${AOM_ROOT}/aom_dsp/blend.h"
-    "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
-    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
-    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
-    "${AOM_ROOT}/aom_dsp/intrapred.c"
-    "${AOM_ROOT}/aom_dsp/intrapred_common.h"
-    "${AOM_ROOT}/aom_dsp/loopfilter.c"
-    "${AOM_ROOT}/aom_dsp/prob.c"
-    "${AOM_ROOT}/aom_dsp/prob.h"
-    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
-    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
-    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
-    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
-    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
-    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
-    "${AOM_ROOT}/aom_dsp/subtract.c"
-    "${AOM_ROOT}/aom_dsp/txfm_common.h"
-    "${AOM_ROOT}/aom_dsp/x86/txfm_common_intrin.h")
-
-set(AOM_DSP_COMMON_ASM_SSE2
-    "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm")
-
-set(AOM_DSP_COMMON_INTRIN_SSE2
-    "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
-    "${AOM_ROOT}/aom_dsp/x86/convolve.h"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
-    "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
-    "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
-    "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")
-
-set(AOM_DSP_COMMON_ASM_SSSE3
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.asm")
-
-set(AOM_DSP_COMMON_INTRIN_SSSE3
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
-
-set(AOM_DSP_COMMON_INTRIN_SSE4_1
-    "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
-    "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
-    "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
-
-set(AOM_DSP_COMMON_INTRIN_AVX2
-    "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
-    "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
-    "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_AVX2
-      ${AOM_DSP_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
-endif ()
-
-if (NOT CONFIG_EXT_PARTITION)
-  set(AOM_DSP_COMMON_ASM_NEON
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm")
-endif ()
-
-set(AOM_DSP_COMMON_ASM_NEON
-    ${AOM_DSP_COMMON_ASM_NEON}
-    "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_ASM_NEON
-      ${AOM_DSP_COMMON_ASM_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm")
-endif ()
-
-if (NOT CONFIG_EXT_PARTITION)
-  set(AOM_DSP_COMMON_INTRIN_NEON
-      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c")
-endif ()
-
-set(AOM_DSP_COMMON_INTRIN_NEON
-    ${AOM_DSP_COMMON_INTRIN_NEON}
-    "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
-    "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_NEON
-      ${AOM_DSP_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c")
-endif ()
-
-if ("${AOM_TARGET_CPU}" STREQUAL "arm64")
-  if (NOT CONFIG_EXT_PARTITION)
-    set(AOM_DSP_COMMON_INTRIN_NEON
-        ${AOM_DSP_COMMON_INTRIN_NEON}
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c")
-  endif ()
-
-  set(AOM_DSP_COMMON_INTRIN_NEON
-      ${AOM_DSP_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c"
-      "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c")
-
-  if (NOT CONFIG_PARALLEL_DEBLOCKING)
-    set(AOM_DSP_COMMON_INTRIN_NEON
-        ${AOM_DSP_COMMON_INTRIN_NEON}
-        "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c"
-        "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c")
-  endif ()
-endif ()
-
-set(AOM_DSP_COMMON_INTRIN_DSPR2
-    "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_DSPR2
-      ${AOM_DSP_COMMON_INTRIN_DSPR2}
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c")
-endif ()
-
-set(AOM_DSP_COMMON_INTRIN_MSA
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_vert_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve_avg_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/fwd_dct32x32_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/idct16x16_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/idct32x32_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/idct4x4_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
-    "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/macros_msa.h"
-    "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h")
-
-if (NOT CONFIG_PARALLEL_DEBLOCKING)
-  set(AOM_DSP_COMMON_INTRIN_MSA
-      ${AOM_DSP_COMMON_INTRIN_MSA}
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c"
-      "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h")
-endif ()
-
-if (CONFIG_HIGHBITDEPTH)
-  set(AOM_DSP_COMMON_ASM_SSE2
-      ${AOM_DSP_COMMON_ASM_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm")
-
-  set(AOM_DSP_COMMON_INTRIN_SSE2
-      ${AOM_DSP_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
-
-  set(AOM_DSP_COMMON_INTRIN_SSSE3
-      ${AOM_DSP_COMMON_INTRIN_SSSE3}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c")
-
-  set(AOM_DSP_COMMON_INTRIN_AVX2
-      ${AOM_DSP_COMMON_INTRIN_AVX2}
-      "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
-else ()
-  set(AOM_DSP_COMMON_INTRIN_DSPR2
-      ${AOM_DSP_COMMON_INTRIN_DSPR2}
-      "${AOM_ROOT}/aom_dsp/mips/itrans16_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans32_cols_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans32_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans4_dspr2.c"
-      "${AOM_ROOT}/aom_dsp/mips/itrans8_dspr2.c")
-endif ()
-
-if (CONFIG_ANS)
-  set(AOM_DSP_COMMON_SOURCES
-      ${AOM_DSP_COMMON_SOURCES}
-      "${AOM_ROOT}/aom_dsp/ans.h")
-else ()
-  set(AOM_DSP_COMMON_SOURCES
-      ${AOM_DSP_COMMON_SOURCES}
-      "${AOM_ROOT}/aom_dsp/entcode.c"
-      "${AOM_ROOT}/aom_dsp/entcode.h")
-endif ()
-
-if (CONFIG_AV1)
-  set(AOM_DSP_COMMON_SOURCES
-      ${AOM_DSP_COMMON_SOURCES}
-      "${AOM_ROOT}/aom_dsp/inv_txfm.c"
-      "${AOM_ROOT}/aom_dsp/inv_txfm.h")
-
-  set(AOM_DSP_COMMON_ASM_SSE2
-      ${AOM_DSP_COMMON_ASM_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
-
-  set(AOM_DSP_COMMON_INTRIN_SSE2
-      ${AOM_DSP_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.c"
-      "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h")
-endif ()
-
-if (CONFIG_AV1_DECODER)
-  set(AOM_DSP_DECODER_SOURCES
-      "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
-      "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
-      "${AOM_ROOT}/aom_dsp/bitreader.h"
-      "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
-      "${AOM_ROOT}/aom_dsp/bitreader_buffer.h")
-
-  if (CONFIG_ANS)
-    set(AOM_DSP_DECODER_SOURCES
-        ${AOM_DSP_DECODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/ansreader.h")
-  else ()
-    set(AOM_DSP_DECODER_SOURCES
-        ${AOM_DSP_DECODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
-        "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
-        "${AOM_ROOT}/aom_dsp/entdec.c"
-        "${AOM_ROOT}/aom_dsp/entdec.h")
-  endif ()
-endif ()
-
-if (CONFIG_AV1_ENCODER)
-  set(AOM_DSP_ENCODER_SOURCES
-      "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
-      "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
-      "${AOM_ROOT}/aom_dsp/bitwriter.h"
-      "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
-      "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
-      "${AOM_ROOT}/aom_dsp/psnr.c"
-      "${AOM_ROOT}/aom_dsp/psnr.h"
-      "${AOM_ROOT}/aom_dsp/sad.c"
-      "${AOM_ROOT}/aom_dsp/variance.c"
-      "${AOM_ROOT}/aom_dsp/variance.h")
-
-  set(AOM_DSP_ENCODER_ASM_SSE2
-      ${AOM_DSP_ENCODER_ASM_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm"
-      "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm")
-
-  set(AOM_DSP_ENCODER_INTRIN_SSE2
-      "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c")
-
-  set(AOM_DSP_ENCODER_ASM_SSSE3
-      "${AOM_ROOT}/aom_dsp/x86/sad_ssse3.asm")
-
-  set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-      "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
-      "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
-
-  set(AOM_DSP_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/aom_dsp/x86/sad_sse3.asm")
-  set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm")
-
-  set(AOM_DSP_ENCODER_INTRIN_AVX2
-      "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
-      "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.h"
-      "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
-      "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
-
-  if (CONFIG_AV1_ENCODER)
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/avg.c"
-        "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
-        "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
-        "${AOM_ROOT}/aom_dsp/quantize.c"
-        "${AOM_ROOT}/aom_dsp/quantize.h"
-        "${AOM_ROOT}/aom_dsp/sum_squares.c")
-
-    set(AOM_DSP_ENCODER_INTRIN_SSE2
-        ${AOM_DSP_ENCODER_INTRIN_SSE2}
-        "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_dct32_8cols_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
-        "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
-        "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c")
-
-    set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-        ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}
-        "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm"
-        "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
-
-    set(AOM_DSP_ENCODER_AVX_ASM_X86_64
-        ${AOM_DSP_ENCODER_AVX_ASM_X86_64}
-        "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
-
-    set(AOM_DSP_ENCODER_INTRIN_MSA
-        "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
-        "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
-        "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
-        "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
-
-      set(AOM_DSP_ENCODER_INTRIN_SSSE3
-          ${AOM_DSP_ENCODER_INTRIN_SSSE3}
-          "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
-          "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c")
-
-    if (CONFIG_HIGHBITDEPTH)
-      set(AOM_DSP_ENCODER_INTRIN_SSE2
-          ${AOM_DSP_ENCODER_INTRIN_SSE2}
-          "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c")
-    endif ()
-  endif ()
-
-  if (CONFIG_HIGHBITDEPTH)
-    set(AOM_DSP_ENCODER_ASM_SSE2
-        ${AOM_DSP_ENCODER_ASM_SSE2}
-        "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
-        "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm")
-
-    set(AOM_DSP_ENCODER_INTRIN_SSE2
-        ${AOM_DSP_ENCODER_INTRIN_SSE2}
-        "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
-
-    set(AOM_DSP_ENCODER_INTRIN_SSE4_1
-        ${AOM_DSP_ENCODER_INTRIN_SSE4_1}
-        "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
-
-    set(AOM_DSP_ENCODER_INTRIN_AVX2
-        ${AOM_DSP_ENCODER_INTRIN_AVX2}
-        "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c")
-  endif ()
-
-  if (CONFIG_ANS)
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/answriter.h"
-        "${AOM_ROOT}/aom_dsp/buf_ans.c"
-        "${AOM_ROOT}/aom_dsp/buf_ans.h")
-  else ()
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
-        "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
-        "${AOM_ROOT}/aom_dsp/entenc.c"
-        "${AOM_ROOT}/aom_dsp/entenc.h")
-  endif ()
-
-  if (CONFIG_INTERNAL_STATS)
-    set(AOM_DSP_ENCODER_SOURCES
-        ${AOM_DSP_ENCODER_SOURCES}
-        "${AOM_ROOT}/aom_dsp/fastssim.c"
-        "${AOM_ROOT}/aom_dsp/psnrhvs.c"
-        "${AOM_ROOT}/aom_dsp/ssim.c"
-        "${AOM_ROOT}/aom_dsp/ssim.h")
-  endif ()
-endif ()
-
-if (CONFIG_LOOP_RESTORATION)
-  set(AOM_DSP_COMMON_INTRIN_SSE2
-      ${AOM_DSP_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c")
-
-  if (CONFIG_HIGHBITDEPTH)
-    set(AOM_DSP_COMMON_INTRIN_SSSE3
-      ${AOM_DSP_COMMON_INTRIN_SSSE3}
-        "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
-  endif ()
-endif ()
-
-if (CONFIG_MOTION_VAR)
-  set(AOM_DSP_ENCODER_INTRIN_SSE4_1
-      ${AOM_DSP_ENCODER_INTRIN_SSE4_1}
-      "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
-      "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
-endif ()
+list(APPEND AOM_DSP_COMMON_SOURCES
+            "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+            "${AOM_ROOT}/aom_dsp/aom_convolve.h"
+            "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+            "${AOM_ROOT}/aom_dsp/aom_filter.h"
+            "${AOM_ROOT}/aom_dsp/aom_simd.h"
+            "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+            "${AOM_ROOT}/aom_dsp/blend.h"
+            "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+            "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+            "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+            "${AOM_ROOT}/aom_dsp/entcode.c"
+            "${AOM_ROOT}/aom_dsp/entcode.h"
+            "${AOM_ROOT}/aom_dsp/fft.c"
+            "${AOM_ROOT}/aom_dsp/fft_common.h"
+            "${AOM_ROOT}/aom_dsp/intrapred.c"
+            "${AOM_ROOT}/aom_dsp/intrapred_common.h"
+            "${AOM_ROOT}/aom_dsp/loopfilter.c"
+            "${AOM_ROOT}/aom_dsp/prob.h"
+            "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+            "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+            "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+            "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+            "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+            "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+            "${AOM_ROOT}/aom_dsp/subtract.c"
+            "${AOM_ROOT}/aom_dsp/txfm_common.h"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSE2
+            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm"
+            "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+            "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
+            "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
+            "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
+            "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSSE3
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
+            "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+            "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
+            "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+            "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
+            "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
+            "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
+            "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_MSA
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
+            "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
+            "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
+
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_DSP_DECODER_SOURCES
+              "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
+              "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
+              "${AOM_ROOT}/aom_dsp/bitreader.h"
+              "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+              "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+              "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
+              "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
+              "${AOM_ROOT}/aom_dsp/entdec.c"
+              "${AOM_ROOT}/aom_dsp/entdec.h"
+              "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
+              "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  list(APPEND AOM_DSP_ENCODER_SOURCES
+              "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
+              "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+              "${AOM_ROOT}/aom_dsp/bitwriter.h"
+              "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+              "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+              "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
+              "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
+              "${AOM_ROOT}/aom_dsp/entenc.c"
+              "${AOM_ROOT}/aom_dsp/entenc.h"
+              "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+              "${AOM_ROOT}/aom_dsp/grain_table.c"
+              "${AOM_ROOT}/aom_dsp/grain_table.h"
+              "${AOM_ROOT}/aom_dsp/noise_model.c"
+              "${AOM_ROOT}/aom_dsp/noise_model.h"
+              "${AOM_ROOT}/aom_dsp/noise_util.c"
+              "${AOM_ROOT}/aom_dsp/noise_util.h"
+              "${AOM_ROOT}/aom_dsp/psnr.c"
+              "${AOM_ROOT}/aom_dsp/psnr.h"
+              "${AOM_ROOT}/aom_dsp/quantize.c"
+              "${AOM_ROOT}/aom_dsp/quantize.h"
+              "${AOM_ROOT}/aom_dsp/sad.c"
+              "${AOM_ROOT}/aom_dsp/sad_av1.c"
+              "${AOM_ROOT}/aom_dsp/sum_squares.c"
+              "${AOM_ROOT}/aom_dsp/variance.c"
+              "${AOM_ROOT}/aom_dsp/variance.h")
+
+  list(APPEND AOM_DSP_ENCODER_ASM_SSE2
+              "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
+              "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
+              "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
+
+  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
+              "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c")
+
+  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+              "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
+
+  list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
+              "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
+              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
+              "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
+              "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+              "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+              "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
+
+  list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
+              "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
+              "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
+              "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
+
+  if(CONFIG_INTERNAL_STATS)
+    list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
+                "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
+                "${AOM_ROOT}/aom_dsp/ssim.h")
+  endif()
+endif()
 
 # Creates aom_dsp build targets. Must not be called until after libaom target
 # has been created.
-function (setup_aom_dsp_targets)
+function(setup_aom_dsp_targets)
   add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
   list(APPEND AOM_LIB_TARGETS aom_dsp_common)
   create_dummy_source_file("aom_av1" "c" "dummy_source_file")
@@ -481,113 +248,97 @@ function (setup_aom_dsp_targets)
   # dummy source file to the aom_dsp target.
   add_dummy_source_file_to_target("aom_dsp" "c")
 
-  if (CONFIG_AV1_DECODER)
+  if(CONFIG_AV1_DECODER)
     add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
-    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder)
+    list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
-  endif ()
+  endif()
 
-  if (CONFIG_AV1_ENCODER)
+  if(CONFIG_AV1_ENCODER)
     add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
-    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder)
+    list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
     target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
-  endif ()
+  endif()
 
-  if (HAVE_SSE2)
+  if(HAVE_SSE2)
     add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
     add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
-                                   "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
+                                  "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
 
-    if (CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2"
-                      "aom")
+    if(CONFIG_AV1_ENCODER)
+      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
       add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
                                     "AOM_DSP_ENCODER_INTRIN_SSE2" "aom")
     endif()
-  endif ()
-
-  if (HAVE_SSE3 AND CONFIG_AV1_ENCODER)
-    add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom")
-  endif ()
+  endif()
 
-  if (HAVE_SSSE3)
+  if(HAVE_SSSE3)
     add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
     add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_SSSE3" "aom")
 
-    if (CONFIG_AV1_ENCODER)
-      if ("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if(CONFIG_AV1_ENCODER)
+      if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
         list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
-             ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
-      endif ()
+                    ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
+      endif()
       add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
-      if (AOM_DSP_ENCODER_INTRIN_SSSE3)
-        add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
-                                      "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
-      endif ()
-    endif ()
-  endif ()
-
-  if (HAVE_SSE4_1)
+      add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
+    endif()
+  endif()
+
+  if(HAVE_SSE4_1)
     add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom")
-    if (CONFIG_AV1_ENCODER)
-      if (AOM_DSP_ENCODER_INTRIN_SSE4_1)
-        add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
-                                      "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
-      endif ()
-      add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1"
-                      "aom")
-    endif ()
-  endif ()
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
+    endif()
+  endif()
 
-  if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
-    if (CONFIG_AV1_ENCODER)
+  if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if(CONFIG_AV1_ENCODER)
       add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
                       "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
-  if (HAVE_AVX2)
+  if(HAVE_AVX2)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_AVX2" "aom")
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
                                     "AOM_DSP_ENCODER_INTRIN_AVX2" "aom")
-    endif ()
-  endif ()
-
-  if (HAVE_NEON_ASM)
-    if (AOM_ADS2GAS_REQUIRED)
-      add_gas_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom")
-    else ()
-      add_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom")
-    endif ()
-  endif ()
-
-  if (HAVE_NEON)
+    endif()
+  endif()
+
+  if(HAVE_NEON)
     add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
                                   "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON"
                                   "aom")
-  endif ()
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_dsp_encoder"
+                                    "AOM_DSP_ENCODER_INTRIN_NEON" "aom")
+    endif()
+  endif()
 
-  if (HAVE_DSPR2)
+  if(HAVE_DSPR2)
     add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_DSPR2" "aom")
-  endif ()
+  endif()
 
-  if (HAVE_MSA)
+  if(HAVE_MSA)
     add_intrinsics_object_library("" "msa" "aom_dsp_common"
                                   "AOM_DSP_COMMON_INTRIN_MSA" "aom")
-    if (CONFIG_AV1_ENCODER)
+    if(CONFIG_AV1_ENCODER)
       add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
                                     "AOM_DSP_ENCODER_INTRIN_MSA" "aom")
-    endif ()
-  endif ()
+    endif()
+  endif()
 
   # Pass the new lib targets up to the parent scope instance of
   # $AOM_LIB_TARGETS.
   set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction ()
-
-endif ()  # AOM_AOM_DSP_AOM_DSP_CMAKE_
+endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk
deleted file mode 100644
index 950db0216..000000000
--- a/third_party/aom/aom_dsp/aom_dsp.mk
+++ /dev/null
@@ -1,439 +0,0 @@
-##
-## Copyright (c) 2016, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-
-
-DSP_SRCS-yes += aom_dsp.mk
-DSP_SRCS-yes += aom_dsp_common.h
-
-DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
-
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/synonyms.h
-
-# bit reader
-DSP_SRCS-yes += prob.h
-DSP_SRCS-yes += prob.c
-DSP_SRCS-$(CONFIG_ANS) += ans.h
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-ifeq ($(CONFIG_ANS),yes)
-DSP_SRCS-yes += answriter.h
-DSP_SRCS-yes += buf_ans.h
-DSP_SRCS-yes += buf_ans.c
-else
-DSP_SRCS-yes += entenc.c
-DSP_SRCS-yes += entenc.h
-DSP_SRCS-yes += daalaboolwriter.c
-DSP_SRCS-yes += daalaboolwriter.h
-endif
-DSP_SRCS-yes += bitwriter.h
-DSP_SRCS-yes += bitwriter_buffer.c
-DSP_SRCS-yes += bitwriter_buffer.h
-DSP_SRCS-yes += binary_codes_writer.c
-DSP_SRCS-yes += binary_codes_writer.h
-DSP_SRCS-yes += psnr.c
-DSP_SRCS-yes += psnr.h
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
-DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
-endif
-
-ifeq ($(CONFIG_AV1_DECODER),yes)
-ifeq ($(CONFIG_ANS),yes)
-DSP_SRCS-yes += ansreader.h
-else
-DSP_SRCS-yes += entdec.c
-DSP_SRCS-yes += entdec.h
-DSP_SRCS-yes += daalaboolreader.c
-DSP_SRCS-yes += daalaboolreader.h
-endif
-DSP_SRCS-yes += bitreader.h
-DSP_SRCS-yes += bitreader_buffer.c
-DSP_SRCS-yes += bitreader_buffer.h
-DSP_SRCS-yes += binary_codes_reader.c
-DSP_SRCS-yes += binary_codes_reader.h
-endif
-
-# intra predictions
-DSP_SRCS-yes += intrapred.c
-DSP_SRCS-yes += intrapred_common.h
-
-ifneq ($(CONFIG_ANS),yes)
-DSP_SRCS-yes += entcode.c
-DSP_SRCS-yes += entcode.h
-endif
-
-DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-
-DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.c
-DSP_SRCS-$(HAVE_AVX2) += x86/intrapred_avx2.c
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_avx2.c
-endif  # CONFIG_HIGHBITDEPTH
-
-DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
-DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
-DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
-
-DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
-
-# inter predictions
-DSP_SRCS-yes            += blend.h
-DSP_SRCS-yes            += blend_a64_mask.c
-DSP_SRCS-yes            += blend_a64_hmask.c
-DSP_SRCS-yes            += blend_a64_vmask.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
-
-# interpolation filters
-DSP_SRCS-yes += aom_convolve.c
-DSP_SRCS-yes += aom_convolve.h
-DSP_SRCS-yes += aom_filter.h
-
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_8t_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_bilinear_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm
-DSP_SRCS-$(HAVE_AVX2)  += x86/aom_subpixel_8t_intrin_avx2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_8t_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_bilinear_sse2.asm
-DSP_SRCS-$(HAVE_AVX2)  += x86/highbd_convolve_avx2.c
-endif
-DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm
-
-ifneq ($(CONFIG_EXT_PARTITION),yes)
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/aom_convolve_neon.c
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/aom_convolve_copy_neon.c
-DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c
-DSP_SRCS-yes += arm/aom_convolve8_neon.c
-DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
-DSP_SRCS-yes += arm/aom_convolve_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
-endif  # CONFIG_EXT_PARTITION
-
-# common (msa)
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h
-
-# common (dspr2)
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_vert_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
-
-# loop filters
-DSP_SRCS-yes += loopfilter.c
-
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
-DSP_SRCS-$(HAVE_SSE2)                += x86/lpf_common_sse2.h
-
-ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes)
-DSP_SRCS-$(HAVE_AVX2)   += x86/loopfilter_avx2.c
-
-DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
-DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
-DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
-DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes   += arm/loopfilter_16_neon.c
-DSP_SRCS-yes   += arm/loopfilter_8_neon.c
-DSP_SRCS-yes   += arm/loopfilter_4_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
-
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_macros_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_masks_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
-endif  # !CONFIG_PARALLEL_DEBLOCKING
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_loopfilter_avx2.c
-endif  # CONFIG_HIGHBITDEPTH
-
-DSP_SRCS-yes            += txfm_common.h
-DSP_SRCS-yes            += x86/txfm_common_intrin.h
-DSP_SRCS-$(HAVE_AVX2)   += x86/common_avx2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
-DSP_SRCS-$(HAVE_SSSE3)  += x86/obmc_intrinsic_ssse3.h
-DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
-
-# forward transform
-ifneq ($(findstring yes,$(CONFIG_AV1)$(CONFIG_PVQ)),)
-DSP_SRCS-$(HAVE_AVX2)   += x86/txfm_common_avx2.h
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-yes            += fwd_txfm.c
-DSP_SRCS-yes            += fwd_txfm.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32_8cols_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
-endif
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.h
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
-DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
-DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
-endif  # CONFIG_AV1_ENCODER
-endif  # CONFIG_AV1
-
-# inverse transform
-ifeq ($(CONFIG_AV1), yes)
-DSP_SRCS-yes            += inv_txfm.h
-DSP_SRCS-yes            += inv_txfm.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
-DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/inv_txfm_common_avx2.h
-DSP_SRCS-$(HAVE_AVX2)   += x86/inv_txfm_avx2.c
-
-ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
-DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
-else
-ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
-DSP_SRCS-yes  += arm/idct4x4_add_neon.c
-DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
-DSP_SRCS-yes  += arm/idct8x8_add_neon.c
-DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
-DSP_SRCS-yes  += arm/idct16x16_add_neon.c
-DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
-DSP_SRCS-yes  += arm/idct32x32_add_neon.c
-endif  # HAVE_NEON
-endif  # HAVE_NEON_ASM
-DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
-
-DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
-DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
-DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
-DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
-DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
-
-ifneq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
-DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
-endif  # CONFIG_HIGHBITDEPTH
-
-ifeq ($(CONFIG_LOOP_RESTORATION),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/aom_convolve_hip_sse2.c
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/aom_highbd_convolve_hip_ssse3.c
-endif
-endif  # CONFIG_LOOP_RESTORATION
-endif  # CONFIG_AV1
-
-# quantization
-ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
-DSP_SRCS-yes            += quantize.c
-DSP_SRCS-yes            += quantize.h
-
-DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
-
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_quantize_intrin_avx2.c
-
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
-DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
-endif
-
-# avg
-DSP_SRCS-yes           += avg.c
-DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
-DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
-DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
-endif
-
-# high bit depth subtract
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
-endif
-
-endif  # CONFIG_AV1_ENCODER
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-yes            += sum_squares.c
-
-DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
-endif # CONFIG_AV1_ENCODER
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-yes            += sad.c
-DSP_SRCS-yes            += subtract.c
-
-DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
-
-DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
-
-DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
-DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
-DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_AVX2)   += x86/sad_highbd_avx2.c
-endif
-
-ifeq ($(CONFIG_AV1_ENCODER),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
-DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
-ifeq ($(CONFIG_MOTION_VAR),yes)
-DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
-endif  #CONFIG_MOTION_VAR
-ifeq ($(CONFIG_EXT_PARTITION),yes)
-DSP_SRCS-$(HAVE_AVX2) += x86/sad_impl_avx2.c
-endif
-endif  #CONFIG_AV1_ENCODER
-
-DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
-endif  # CONFIG_HIGHBITDEPTH
-
-endif  # CONFIG_AV1_ENCODER
-
-ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
-DSP_SRCS-yes            += variance.c
-DSP_SRCS-yes            += variance.h
-
-DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
-DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
-
-DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
-DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
-
-DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
-DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_impl_sse2.asm
-DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
-
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
-endif  # ARCH_X86_64
-
-DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
-
-ifeq ($(CONFIG_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
-endif  # CONFIG_HIGHBITDEPTH
-endif  # CONFIG_AV1_ENCODER
-
-DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
-
-DSP_SRCS-yes += aom_dsp_rtcd.c
-DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
-
-DSP_SRCS-yes += aom_simd.h
-DSP_SRCS-yes += aom_simd_inline.h
-DSP_SRCS-yes += simd/v64_intrinsics.h
-DSP_SRCS-yes += simd/v64_intrinsics_c.h
-DSP_SRCS-yes += simd/v128_intrinsics.h
-DSP_SRCS-yes += simd/v128_intrinsics_c.h
-DSP_SRCS-yes += simd/v256_intrinsics.h
-DSP_SRCS-yes += simd/v256_intrinsics_c.h
-DSP_SRCS-yes += simd/v256_intrinsics_v128.h
-DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
-DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
-DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
-DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
-DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
-DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
-
-$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
index 3d3bcba37..c5dc9a834 100644
--- a/third_party/aom/aom_dsp/aom_dsp_common.h
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -12,7 +12,8 @@
 #ifndef AOM_DSP_AOM_DSP_COMMON_H_
 #define AOM_DSP_AOM_DSP_COMMON_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
@@ -21,11 +22,7 @@ extern "C" {
 #endif
 
 #ifndef MAX_SB_SIZE
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 #define MAX_SB_SIZE 128
-#else
-#define MAX_SB_SIZE 64
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 #endif  // ndef MAX_SB_SIZE
 
 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
@@ -52,22 +49,14 @@ extern "C" {
 #define UNLIKELY(v) (v)
 #endif
 
-typedef uint16_t qm_val_t;
+typedef uint8_t qm_val_t;
 #define AOM_QM_BITS 5
 
-#if CONFIG_HIGHBITDEPTH
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
 // tran_high_t is the datatype used for intermediate transform stages.
 typedef int64_t tran_high_t;
 typedef int32_t tran_low_t;
-#else
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif  // CONFIG_HIGHBITDEPTH
 
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255 : (val < 0) ? 0 : val;
@@ -77,10 +66,6 @@ static INLINE int clamp(int value, int low, int high) {
   return value < low ? low : (value > high ? high : value);
 }
 
-static INLINE uint32_t clamp32u(uint32_t value, uint32_t low, uint32_t high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
 static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
   return value < low ? low : (value > high ? high : value);
 }
@@ -98,6 +83,14 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
   }
 }
 
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken since the result of right shifting a negative signed
+// value is implementation-defined (not undefined) by the C standard.
+static INLINE unsigned int negative_to_zero(int value) {
+  return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
index 11a57d382..5d7d4515b 100644
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -8,9 +8,11 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #define RTCD_C
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/aom_once.h"
 
 void aom_dsp_rtcd() { once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
index f4f6c64d4..a8ac5eb5c 100755
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
 sub aom_dsp_forward_decls() {
 print <<EOF
 /*
@@ -7,6 +17,7 @@ print <<EOF
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "av1/common/enums.h"
+#include "av1/common/blockd.h"
 
 EOF
 }
@@ -28,11 +39,7 @@ if ($opts{arch} eq "x86_64") {
   $avx2_x86_64 = 'avx2';
 }
 
-if (aom_config("CONFIG_EXT_PARTITION") eq "yes") {
-  @block_widths = (4, 8, 16, 32, 64, 128)
-} else {
-  @block_widths = (4, 8, 16, 32, 64)
-}
+@block_widths = (4, 8, 16, 32, 64, 128);
 
 @block_sizes = ();
 foreach $w (@block_widths) {
@@ -40,36 +47,24 @@ foreach $w (@block_widths) {
     push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
   }
 }
-if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") {
-  push @block_sizes, [4, 16];
-  push @block_sizes, [16, 4];
-  push @block_sizes, [8, 32];
-  push @block_sizes, [32, 8];
-  push @block_sizes, [16, 64];
-  push @block_sizes, [64, 16];
-  if (aom_config("CONFIG_EXT_PARTITION") eq "yes") {
-      push @block_sizes, [32, 128];
-      push @block_sizes, [128, 32];
-  }
-}
-
-@tx_dims = (2, 4, 8, 16, 32);
-if (aom_config("CONFIG_TX64X64") eq "yes") {
-  push @tx_dims, '64';
-}
-
+# Extended 1:4 and 4:1 block sizes; appended unconditionally.
+push @block_sizes, (
+  [4, 16],  [16, 4],
+  [8, 32],  [32, 8],
+  [16, 64], [64, 16],
+);
+
+@tx_dims = (2, 4, 8, 16, 32, 64);
 @tx_sizes = ();
 foreach $w (@tx_dims) {
   push @tx_sizes, [$w, $w];
   foreach $h (@tx_dims) {
     push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
+    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
   }
 }
 
-@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153 paeth smooth/;
-if (aom_config("CONFIG_SMOOTH_HV") eq "yes") {
-  push @pred_names, qw/smooth_v smooth_h/;
-}
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
 
 #
 # Intra prediction
@@ -80,73 +75,125 @@ foreach (@tx_sizes) {
   foreach $pred_name (@pred_names) {
     add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
               "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
-                "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-    }
+    add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+              "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   }
 }
 
 specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_top_predictor_4x8 sse2/;
+specialize qw/aom_dc_top_predictor_4x16 sse2/;
 specialize qw/aom_dc_top_predictor_8x4 sse2/;
 specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x4 sse2/;
 specialize qw/aom_dc_top_predictor_16x8 sse2/;
 specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
+specialize qw/aom_dc_top_predictor_32x8 sse2/;
 specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_left_predictor_4x8 sse2/;
+specialize qw/aom_dc_left_predictor_4x16 sse2/;
 specialize qw/aom_dc_left_predictor_8x4 sse2/;
 specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x4 sse2/;
 specialize qw/aom_dc_left_predictor_16x8 sse2/;
 specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
+specialize qw/aom_dc_left_predictor_32x8 sse2/;
 specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_128_predictor_4x8 sse2/;
+specialize qw/aom_dc_128_predictor_4x16 sse2/;
 specialize qw/aom_dc_128_predictor_8x4 sse2/;
 specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x4 sse2/;
 specialize qw/aom_dc_128_predictor_16x8 sse2/;
 specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
 specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
+specialize qw/aom_dc_128_predictor_32x8 sse2/;
 specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
 specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 specialize qw/aom_v_predictor_4x8 sse2/;
+specialize qw/aom_v_predictor_4x16 sse2/;
 specialize qw/aom_v_predictor_8x4 sse2/;
 specialize qw/aom_v_predictor_8x8 neon msa sse2/;
 specialize qw/aom_v_predictor_8x16 sse2/;
+specialize qw/aom_v_predictor_8x32 sse2/;
+specialize qw/aom_v_predictor_16x4 sse2/;
 specialize qw/aom_v_predictor_16x8 sse2/;
 specialize qw/aom_v_predictor_16x16 neon msa sse2/;
 specialize qw/aom_v_predictor_16x32 sse2/;
+specialize qw/aom_v_predictor_16x64 sse2/;
+specialize qw/aom_v_predictor_32x8 sse2/;
 specialize qw/aom_v_predictor_32x16 sse2 avx2/;
 specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 sse2 avx2/;
 specialize qw/aom_h_predictor_4x8 sse2/;
+specialize qw/aom_h_predictor_4x16 sse2/;
 specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x4 sse2/;
 specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x16 sse2/;
+specialize qw/aom_h_predictor_8x32 sse2/;
+specialize qw/aom_h_predictor_16x4 sse2/;
 specialize qw/aom_h_predictor_16x8 sse2/;
 specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_16x32 sse2/;
+specialize qw/aom_h_predictor_16x64 sse2/;
+specialize qw/aom_h_predictor_32x8 sse2/;
 specialize qw/aom_h_predictor_32x16 sse2/;
 specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_h_predictor_32x64 sse2/;
+specialize qw/aom_h_predictor_64x64 sse2/;
+specialize qw/aom_h_predictor_64x32 sse2/;
+specialize qw/aom_h_predictor_64x16 sse2/;
 specialize qw/aom_paeth_predictor_4x4 ssse3/;
 specialize qw/aom_paeth_predictor_4x8 ssse3/;
+specialize qw/aom_paeth_predictor_4x16 ssse3/;
 specialize qw/aom_paeth_predictor_8x4 ssse3/;
 specialize qw/aom_paeth_predictor_8x8 ssse3/;
 specialize qw/aom_paeth_predictor_8x16 ssse3/;
+specialize qw/aom_paeth_predictor_8x32 ssse3/;
+specialize qw/aom_paeth_predictor_16x4 ssse3/;
 specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x8 ssse3/;
 specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x8 ssse3/;
 specialize qw/aom_paeth_predictor_16x16 ssse3/;
 specialize qw/aom_paeth_predictor_16x32 ssse3/;
@@ -154,34 +201,86 @@ specialize qw/aom_paeth_predictor_32x16 ssse3/;
 specialize qw/aom_paeth_predictor_32x32 ssse3/;
 specialize qw/aom_smooth_predictor_4x4 ssse3/;
 specialize qw/aom_smooth_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_predictor_4x16 ssse3/;
 specialize qw/aom_smooth_predictor_8x4 ssse3/;
 specialize qw/aom_smooth_predictor_8x8 ssse3/;
 specialize qw/aom_smooth_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_predictor_16x4 ssse3/;
 specialize qw/aom_smooth_predictor_16x8 ssse3/;
 specialize qw/aom_smooth_predictor_16x16 ssse3/;
 specialize qw/aom_smooth_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_predictor_32x8 ssse3/;
 specialize qw/aom_smooth_predictor_32x16 ssse3/;
 specialize qw/aom_smooth_predictor_32x32 ssse3/;
-
-specialize qw/aom_d63e_predictor_4x4 ssse3/;
-specialize qw/aom_d135_predictor_4x4 neon/;
-specialize qw/aom_d153_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
+
+# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+# by multiply and shift.
 specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
 specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_d153_predictor_8x8 ssse3/;
+specialize qw/aom_dc_predictor_4x16 sse2/;
 specialize qw/aom_dc_predictor_8x4 sse2/;
 specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
 specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_d153_predictor_16x16 ssse3/;
+specialize qw/aom_dc_predictor_8x32 sse2/;
+specialize qw/aom_dc_predictor_16x4 sse2/;
 specialize qw/aom_dc_predictor_16x8 sse2/;
 specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
 specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_d153_predictor_32x32 ssse3/;
-
+specialize qw/aom_dc_predictor_16x64 sse2/;
+specialize qw/aom_dc_predictor_32x8 sse2/;
 specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
 
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_4x4 sse2/;
   specialize qw/aom_highbd_v_predictor_4x8 sse2/;
   specialize qw/aom_highbd_v_predictor_8x4 sse2/;
@@ -192,16 +291,21 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_16x32 sse2/;
   specialize qw/aom_highbd_v_predictor_32x16 sse2/;
   specialize qw/aom_highbd_v_predictor_32x32 sse2/;
-  specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
+
+  # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+  # by multiply and shift.
+  specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;
   specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
   specialize qw/aom_highbd_dc_predictor_8x4 sse2/;;
-  specialize qw/aom_highbd_dc_predictor_8x8 sse2/;;
+  specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;;
   specialize qw/aom_highbd_dc_predictor_8x16 sse2/;;
   specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
+  specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
   specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
   specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
+  specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_predictor_64x64 neon/;
+
   specialize qw/aom_highbd_h_predictor_4x4 sse2/;
   specialize qw/aom_highbd_h_predictor_4x8 sse2/;
   specialize qw/aom_highbd_h_predictor_8x4 sse2/;
@@ -242,253 +346,129 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
   specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
-  
-  specialize qw/aom_highbd_d117_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d117_predictor_8x8 ssse3/;
-  specialize qw/aom_highbd_d117_predictor_16x16 ssse3/;
-  specialize qw/aom_highbd_d117_predictor_32x32 ssse3/;
-  specialize qw/aom_highbd_d135_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d135_predictor_8x8 ssse3/;
-  specialize qw/aom_highbd_d135_predictor_16x16 ssse3/;
-  specialize qw/aom_highbd_d135_predictor_32x32 ssse3/;
-  specialize qw/aom_highbd_d153_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d153_predictor_8x8 ssse3/;
-  specialize qw/aom_highbd_d153_predictor_16x16 ssse3/;
-  specialize qw/aom_highbd_d153_predictor_32x32 ssse3/;
-
-  specialize qw/aom_highbd_d45e_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_d45e_predictor_16x8 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_16x16 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_16x32 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_32x16 avx2/;
-  specialize qw/aom_highbd_d45e_predictor_32x32 avx2/;
-}  # CONFIG_HIGHBITDEPTH
 
 #
 # Sub Pixel Filters
 #
 add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve_avg/,              "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8/,                 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_horiz/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_2d/,                 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_horiz/,              "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_vert/,               "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_2d/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_horiz/,          "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_vert/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-add_proto qw/void aom_convolve8_horiz_scale/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert_scale/,      "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert_scale/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_scale/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_scale/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
 
 specialize qw/aom_convolve_copy       sse2      /;
-specialize qw/aom_convolve_avg        sse2      /;
-specialize qw/aom_convolve8           sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_avg       sse2 ssse3/;
-specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
-specialize qw/aom_convolve8_avg_vert  sse2 ssse3/;
-specialize qw/aom_scaled_2d                ssse3/;
-
-if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
-  add_proto qw/void aom_convolve8_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_vert/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_hip/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-  add_proto qw/void aom_convolve8_add_src_vert_hip/,  "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-  specialize qw/aom_convolve8_add_src ssse3/;
-  specialize qw/aom_convolve8_add_src_horiz ssse3/;
-  specialize qw/aom_convolve8_add_src_vert ssse3/;
-  specialize qw/aom_convolve8_add_src_hip sse2/;
-}  # CONFIG_LOOP_RESTORATION
-
-# TODO(any): These need to be extended to up to 128x128 block sizes
-if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
-  specialize qw/aom_convolve_copy       neon dspr2 msa/;
-  specialize qw/aom_convolve_avg        neon dspr2 msa/;
-  specialize qw/aom_convolve8           neon dspr2 msa/;
-  specialize qw/aom_convolve8_horiz     neon dspr2 msa/;
-  specialize qw/aom_convolve8_vert      neon dspr2 msa/;
-  specialize qw/aom_convolve8_avg       neon dspr2 msa/;
-  specialize qw/aom_convolve8_avg_horiz neon dspr2 msa/;
-  specialize qw/aom_convolve8_avg_vert  neon dspr2 msa/;
-}
 
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve_copy sse2 avx2/;
 
-  add_proto qw/void aom_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve_avg sse2 avx2/;
+add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
 
-  add_proto qw/void aom_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8 avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_avg avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_avg_horiz avx2/, "$sse2_x86_64";
-
-  add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/aom_highbd_convolve8_avg_vert avx2/, "$sse2_x86_64";
-
-  if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
-    add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-    add_proto qw/void aom_highbd_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-
-    specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
-    specialize qw/aom_highbd_convolve8_add_src_hip ssse3/;
-    # The _horiz/_vert functions are currently unused, so we don't bother
-    # specialising them.
-  }  # CONFIG_LOOP_RESTORATION
-}  # CONFIG_HIGHBITDEPTH
+add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
 
 #
 # Loopfilter
 #
-add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_vertical_16 sse2/;
-} else {
-  specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
-  $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon;
-}
+add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_14 sse2 neon/;
 
-add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
-  $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon;
-}
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_14_dual sse2/;
+
+add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_6 sse2/;
 
 add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_vertical_8 sse2/;
-} else {
-  specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_vertical_8 sse2 neon/;
 
 add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-  $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon;
-}
+specialize qw/aom_lpf_vertical_8_dual sse2/;
 
 add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_vertical_4 sse2/;
-} else {
-  specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_vertical_4 sse2/;
 
 add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_vertical_4_dual sse2/;
 
-add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_edge_8 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
-  $aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon;
-}
+add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_14 sse2/;
 
-add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_edge_16 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
-  $aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon;
-}
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_14_dual sse2/;
+
+add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_6_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_8 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_horizontal_8 sse2 neon/;
 
 add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-  $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon;
-}
+specialize qw/aom_lpf_horizontal_8_dual sse2/;
 
 add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") {
-  specialize qw/aom_lpf_horizontal_4 sse2/;
-} else {
-  specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_horizontal_4 sse2/;
 
 add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") {
-  specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
-}
+specialize qw/aom_lpf_horizontal_4_dual sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_14 sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_8 sse2/;
 
-if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_16 sse2/;
+add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_6 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/;
+add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_6_dual sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_8 sse2/;
+add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_vertical_4 sse2/;
+add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_4 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
 
-  add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-}  # CONFIG_HIGHBITDEPTH
+add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
+
+# Helper functions.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize "av1_round_shift_array", qw/sse4_1 neon/;
 
 #
 # Encoder functions.
@@ -497,170 +477,43 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
 #
 # Forward transform
 #
-if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq "yes")){
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4 sse2/;
-
-    add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4_1 sse2/;
-
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
     add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
 
-    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct16x16 sse2/;
-
-    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32 sse2 avx2/;
-
-    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32_rd sse2 avx2/;
-
     # High bit depth
-    add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct4x4 sse2/;
-
     add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct8x8 sse2/;
 
-    add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct16x16 sse2/;
-
-    add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct32x32 sse2/;
-
-    add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
-  } else {
-    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4 sse2 msa/;
+    # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
+    add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
 
-    add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct4x4_1 sse2/;
+    add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft4x4_float                  sse2/;
 
-    add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
-
-    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct16x16 sse2 msa/;
-
-    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32 sse2 avx2 msa/;
-
-    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
-  }  # CONFIG_HIGHBITDEPTH
-}  # CONFIG_AV1_ENCODER
-
-#
-# Inverse transform
-if (aom_config("CONFIG_AV1") eq "yes") {
-  add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-
-  add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_iwht4x4_16_add sse2/;
-
-  add_proto qw/void aom_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-
-  add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-
-  add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct4x4_16_add sse2/;
-
-  add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct4x4_1_add sse2/;
-
-  add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_64_add sse2 ssse3/;
-
-  add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_12_add sse2 ssse3/;
-
-  add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_1_add sse2/;
-
-  add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_256_add sse2 avx2/;
-
-  add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_38_add avx2/;
-
-  add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_10_add sse2 avx2/;
-
-  add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_1_add sse2 avx2/;
-
-  add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/;
-
-  add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/;
-  # Need to add 135 eob idct32x32 implementations.
-  $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
-
-  add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/;
-
-  add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_1_add sse2 avx2/;
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  } else {
-    add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct4x4_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct4x4_16_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/;
+    add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft8x8_float avx2             sse2/;
 
-    add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
+    add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft16x16_float avx2           sse2/;
 
-    add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
+    add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_fft32x32_float avx2           sse2/;
 
-    add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_1_add sse2 avx2 neon dspr2 msa/;
+    add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
 
-    add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_256_add sse2 avx2 neon dspr2 msa/;
+    add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft4x4_float                 sse2/;
 
-    add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_38_add avx2/;
+    add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft8x8_float avx2            sse2/;
 
-    add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct16x16_10_add sse2 avx2 neon dspr2 msa/;
+    add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft16x16_float avx2          sse2/;
 
-    add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2 neon dspr2 msa/;
-
-    add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2 neon dspr2 msa/;
-    # Need to add 135 eob idct32x32 implementations.
-    $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
-    $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
-    $aom_idct32x32_135_add_dspr2=aom_idct32x32_1024_add_dspr2;
-    $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
-
-    add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2 neon dspr2 msa/;
-    # Need to add 34 eob idct32x32 neon implementation.
-    $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
-
-    add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_idct32x32_1_add sse2 avx2 neon dspr2 msa/;
-
-    add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_iwht4x4_1_add msa/;
-
-    add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/aom_iwht4x4_16_add msa sse2/;
-  }  # CONFIG_HIGHBITDEPTH
-}  # CONFIG_AV1
+    add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+    specialize qw/aom_ifft32x32_float avx2          sse2/;
+}  # CONFIG_AV1_ENCODER
 
 #
 # Quantization
@@ -685,29 +538,26 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 
 }  # CONFIG_AV1_ENCODER
-if (aom_config("CONFIG_AV1") eq "yes") {
-  #
-  # Alpha blending with mask
-  #
-  if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
-    add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-  }
-  add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-  add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
-  add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
-  specialize "aom_blend_a64_mask", qw/sse4_1/;
-  specialize "aom_blend_a64_hmask", qw/sse4_1/;
-  specialize "aom_blend_a64_vmask", qw/sse4_1/;
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
-    add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
-    add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
-    specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-    specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-    specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-  }
-}  # CONFIG_AV1
+
+#
+# Alpha blending with mask
+#
+add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
+specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 neon/;
+add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
+add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
+add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+specialize "aom_blend_a64_mask", qw/sse4_1/;
+specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
+specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
+
+add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
+add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
+specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
+specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
 
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   #
@@ -716,6 +566,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
   specialize qw/aom_subtract_block neon msa sse2/;
 
+  add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/aom_highbd_subtract_block sse2/;
+
   if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     #
     # Sum of Squares
@@ -729,53 +582,13 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
 
   #
-  # Avg
-  #
-  if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-    #
-    # Avg
-    #
-    specialize qw/aom_avg_8x8 sse2 neon msa/;
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-      specialize qw/aom_highbd_subtract_block sse2/;
-    }
-
-    #
-    # Minmax
-    #
-    add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-    specialize qw/aom_minmax_8x8 sse2 neon/;
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-    }
-
-    add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-    specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
-
-    add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-    specialize qw/aom_hadamard_16x16 sse2 neon/;
-
-    add_proto qw/int aom_satd/, "const int16_t *coeff, int length";
-    specialize qw/aom_satd sse2 neon/;
-
-    add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height";
-    specialize qw/aom_int_pro_row sse2 neon/;
-
-    add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width";
-    specialize qw/aom_int_pro_col sse2 neon/;
-
-    add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl";
-    specialize qw/aom_vector_var neon sse2/;
-  }  # CONFIG_AV1_ENCODER
-
-  #
   # Single block SAD / Single block Avg SAD
   #
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
     add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+    add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
   }
 
   specialize qw/aom_sad128x128    avx2          sse2/;
@@ -812,7 +625,59 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad4x8_avg          msa sse2/;
   specialize qw/aom_sad4x4_avg          msa sse2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+  specialize qw/aom_sad4x16      sse2/;
+  specialize qw/aom_sad16x4      sse2/;
+  specialize qw/aom_sad8x32      sse2/;
+  specialize qw/aom_sad32x8      sse2/;
+  specialize qw/aom_sad16x64     sse2/;
+  specialize qw/aom_sad64x16     sse2/;
+
+  specialize qw/aom_sad4x16_avg  sse2/;
+  specialize qw/aom_sad16x4_avg  sse2/;
+  specialize qw/aom_sad8x32_avg  sse2/;
+  specialize qw/aom_sad32x8_avg  sse2/;
+  specialize qw/aom_sad16x64_avg sse2/;
+  specialize qw/aom_sad64x16_avg sse2/;
+
+  specialize qw/aom_jnt_sad128x128_avg ssse3/;
+  specialize qw/aom_jnt_sad128x64_avg  ssse3/;
+  specialize qw/aom_jnt_sad64x128_avg  ssse3/;
+  specialize qw/aom_jnt_sad64x64_avg   ssse3/;
+  specialize qw/aom_jnt_sad64x32_avg   ssse3/;
+  specialize qw/aom_jnt_sad32x64_avg   ssse3/;
+  specialize qw/aom_jnt_sad32x32_avg   ssse3/;
+  specialize qw/aom_jnt_sad32x16_avg   ssse3/;
+  specialize qw/aom_jnt_sad16x32_avg   ssse3/;
+  specialize qw/aom_jnt_sad16x16_avg   ssse3/;
+  specialize qw/aom_jnt_sad16x8_avg    ssse3/;
+  specialize qw/aom_jnt_sad8x16_avg    ssse3/;
+  specialize qw/aom_jnt_sad8x8_avg     ssse3/;
+  specialize qw/aom_jnt_sad8x4_avg     ssse3/;
+  specialize qw/aom_jnt_sad4x8_avg     ssse3/;
+  specialize qw/aom_jnt_sad4x4_avg     ssse3/;
+
+  specialize qw/aom_jnt_sad4x16_avg    ssse3/;
+  specialize qw/aom_jnt_sad16x4_avg    ssse3/;
+  specialize qw/aom_jnt_sad8x32_avg    ssse3/;
+  specialize qw/aom_jnt_sad32x8_avg    ssse3/;
+  specialize qw/aom_jnt_sad16x64_avg   ssse3/;
+  specialize qw/aom_jnt_sad64x16_avg   ssse3/;
+
+  add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+  add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+
+  specialize qw/aom_sad4xh   sse2/;
+  specialize qw/aom_sad8xh   sse2/;
+  specialize qw/aom_sad16xh  sse2/;
+  specialize qw/aom_sad32xh  sse2/;
+  specialize qw/aom_sad64xh  sse2/;
+  specialize qw/aom_sad128xh sse2/;
+
+
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -821,31 +686,45 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
         specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
         specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
       }
+      add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
     }
     specialize qw/aom_highbd_sad128x128 avx2/;
     specialize qw/aom_highbd_sad128x64  avx2/;
     specialize qw/aom_highbd_sad64x128  avx2/;
-    specialize qw/aom_highbd_sad64x64   avx2/;
-    specialize qw/aom_highbd_sad64x32   avx2/;
-    specialize qw/aom_highbd_sad32x64   avx2/;
-    specialize qw/aom_highbd_sad32x32   avx2/;
-    specialize qw/aom_highbd_sad32x16   avx2/;
-    specialize qw/aom_highbd_sad16x32   avx2/;
-    specialize qw/aom_highbd_sad16x16   avx2/;
-    specialize qw/aom_highbd_sad16x8    avx2/;
+    specialize qw/aom_highbd_sad64x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x64   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x32   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x16   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x8    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x4     sse2/;
 
     specialize qw/aom_highbd_sad128x128_avg avx2/;
     specialize qw/aom_highbd_sad128x64_avg  avx2/;
     specialize qw/aom_highbd_sad64x128_avg  avx2/;
-    specialize qw/aom_highbd_sad64x64_avg   avx2/;
-    specialize qw/aom_highbd_sad64x32_avg   avx2/;
-    specialize qw/aom_highbd_sad32x64_avg   avx2/;
-    specialize qw/aom_highbd_sad32x32_avg   avx2/;
-    specialize qw/aom_highbd_sad32x16_avg   avx2/;
-    specialize qw/aom_highbd_sad16x32_avg   avx2/;
-    specialize qw/aom_highbd_sad16x16_avg   avx2/;
-    specialize qw/aom_highbd_sad16x8_avg    avx2/;
-  }
+    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
+    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
+    specialize qw/aom_highbd_sad8x4_avg     sse2/;
+
+    specialize qw/aom_highbd_sad16x4       sse2/;
+    specialize qw/aom_highbd_sad8x32       sse2/;
+    specialize qw/aom_highbd_sad32x8       sse2/;
+    specialize qw/aom_highbd_sad16x64      sse2/;
+    specialize qw/aom_highbd_sad64x16      sse2/;
+
+    specialize qw/aom_highbd_sad16x4_avg   sse2/;
+    specialize qw/aom_highbd_sad8x32_avg   sse2/;
+    specialize qw/aom_highbd_sad32x8_avg   sse2/;
+    specialize qw/aom_highbd_sad16x64_avg  sse2/;
+    specialize qw/aom_highbd_sad64x16_avg  sse2/;
 
   #
   # Masked SAD
@@ -856,90 +735,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
   }
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
     foreach (@block_sizes) {
       ($w, $h) = @$_;
       add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
       specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
     }
-  }
+
 
   #
   # OBMC SAD
   #
-  if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
+    }
+  }
+
+
     foreach (@block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+      add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-         specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
-      }
-    }
-
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-        if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-          specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
-        }
+        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
       }
     }
-  }
 
-  #
-  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-  #
-  # Blocks of 3
-  foreach $s (@block_widths) {
-    add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  }
-  specialize qw/aom_sad64x64x3            msa/;
-  specialize qw/aom_sad32x32x3            msa/;
-  specialize qw/aom_sad16x16x3 sse3 ssse3 msa/;
-  specialize qw/aom_sad8x8x3   sse3       msa/;
-  specialize qw/aom_sad4x4x3   sse3       msa/;
-
-  add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad16x8x3 sse3 ssse3 msa/;
-  add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad8x16x3 sse3 msa/;
-
-  # Blocks of 8
-  foreach $s (@block_widths) {
-    add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  }
-  specialize qw/aom_sad64x64x8        msa/;
-  specialize qw/aom_sad32x32x8        msa/;
-  specialize qw/aom_sad16x16x8 sse4_1 msa/;
-  specialize qw/aom_sad8x8x8   sse4_1 msa/;
-  specialize qw/aom_sad4x4x8   sse4_1 msa/;
-
-  add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad16x8x8 sse4_1 msa/;
-  add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad8x16x8 sse4_1 msa/;
-  add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad8x4x8 msa/;
-  add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/aom_sad4x8x8 msa/;
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    foreach $s (@block_widths) {
-      # Blocks of 3
-      add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-      # Blocks of 8
-      add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    }
-    # Blocks of 3
-    add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    # Blocks of 8
-    add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-    add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  }
 
   #
   # Multi-block SAD, comparing a reference to N independent blocks
@@ -966,29 +789,47 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sad4x8x4d               msa sse2/;
   specialize qw/aom_sad4x4x4d               msa sse2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    #
-    # Multi-block SAD, comparing a reference to N independent blocks
-    #
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-      if ($w != 128 && $h != 128) {
-        specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
-      }
+  specialize qw/aom_sad4x16x4d  sse2/;
+  specialize qw/aom_sad16x4x4d  sse2/;
+  specialize qw/aom_sad8x32x4d  sse2/;
+  specialize qw/aom_sad32x8x4d  sse2/;
+  specialize qw/aom_sad16x64x4d sse2/;
+  specialize qw/aom_sad64x16x4d sse2/;
+
+  #
+  # Multi-block SAD, comparing a reference to N independent blocks
+  #
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+    if ($w != 128 && $h != 128) {
+      specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
     }
-    specialize qw/aom_highbd_sad128x128x4d avx2/;
-    specialize qw/aom_highbd_sad128x64x4d  avx2/;
-    specialize qw/aom_highbd_sad64x128x4d  avx2/;
-    specialize qw/aom_highbd_sad64x64x4d   avx2/;
-    specialize qw/aom_highbd_sad64x32x4d   avx2/;
-    specialize qw/aom_highbd_sad32x64x4d   avx2/;
-    specialize qw/aom_highbd_sad32x32x4d   avx2/;
-    specialize qw/aom_highbd_sad32x16x4d   avx2/;
-    specialize qw/aom_highbd_sad16x32x4d   avx2/;
-    specialize qw/aom_highbd_sad16x16x4d   avx2/;
-    specialize qw/aom_highbd_sad16x8x4d    avx2/;
   }
+  specialize qw/aom_highbd_sad128x128x4d avx2/;
+  specialize qw/aom_highbd_sad128x64x4d  avx2/;
+  specialize qw/aom_highbd_sad64x128x4d  avx2/;
+  specialize qw/aom_highbd_sad64x64x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad64x32x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad32x64x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad32x32x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad32x16x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad16x32x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad16x16x4d   sse2 avx2/;
+  specialize qw/aom_highbd_sad16x8x4d    sse2 avx2/;
+  specialize qw/aom_highbd_sad8x16x4d    sse2/;
+  specialize qw/aom_highbd_sad8x8x4d     sse2/;
+  specialize qw/aom_highbd_sad8x4x4d     sse2/;
+  specialize qw/aom_highbd_sad4x8x4d     sse2/;
+  specialize qw/aom_highbd_sad4x4x4d     sse2/;
+
+  specialize qw/aom_highbd_sad4x16x4d  sse2/;
+  specialize qw/aom_highbd_sad16x4x4d  sse2/;
+  specialize qw/aom_highbd_sad8x32x4d  sse2/;
+  specialize qw/aom_highbd_sad32x8x4d  sse2/;
+  specialize qw/aom_highbd_sad16x64x4d sse2/;
+  specialize qw/aom_highbd_sad64x16x4d sse2/;
+
 
   #
   # Structured Similarity (SSIM)
@@ -1000,9 +841,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
     specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
 
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    }
+    add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+
   }
 }  # CONFIG_AV1_ENCODER
 
@@ -1015,8 +855,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
-  specialize qw/aom_get16x16var sse2 avx2 neon msa/;
-  specialize qw/aom_get8x8var   sse2      neon msa/;
+  specialize qw/aom_get16x16var           neon msa/;
+  specialize qw/aom_get8x8var             neon msa/;
 
 
   add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
@@ -1029,7 +869,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_mse8x16           sse2           msa/;
   specialize qw/aom_mse8x8            sse2           msa/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     foreach $bd (8, 10, 12) {
       add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
       add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
@@ -1042,25 +881,48 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
       specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
       specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
     }
-  }
+
 
   #
-  # ...
   #
-  add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride";
+  #
+  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
+                                          int subpel_y_q3, const uint8_t *ref, int ref_stride";
   specialize qw/aom_upsampled_pred sse2/;
-  add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride";
+
+  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                   int ref_stride";
   specialize qw/aom_comp_avg_upsampled_pred sse2/;
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd";
-    specialize qw/aom_highbd_upsampled_pred sse2/;
-    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd";
-    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-  }
+  add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+  specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
 
+
+  add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                 const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3,
+                                                 int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+  specialize qw/aom_highbd_upsampled_pred sse2/;
+
+  add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                          const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+                                                          int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd";
+  specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+
+  add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                              const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+                                                              int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+                                                              int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param";
+  specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
+
+
+  #
   #
-  # ...
   #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
   add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
@@ -1082,27 +944,33 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
     add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
   }
-
+  specialize qw/aom_variance128x128   sse2 avx2         /;
+  specialize qw/aom_variance128x64    sse2 avx2         /;
+  specialize qw/aom_variance64x128    sse2 avx2         /;
   specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x64     sse2      neon msa/;
+  specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
   specialize qw/aom_variance32x16     sse2 avx2 msa/;
-  specialize qw/aom_variance16x32     sse2      msa/;
+  specialize qw/aom_variance16x32     sse2 avx2 msa/;
   specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
-  specialize qw/aom_variance16x8      sse2      neon msa/;
+  specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
   specialize qw/aom_variance8x16      sse2      neon msa/;
   specialize qw/aom_variance8x8       sse2      neon msa/;
   specialize qw/aom_variance8x4       sse2           msa/;
   specialize qw/aom_variance4x8       sse2           msa/;
   specialize qw/aom_variance4x4       sse2           msa/;
 
+  specialize qw/aom_sub_pixel_variance128x128   avx2          sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance128x64    avx2          sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x128    avx2          sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance64x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32               msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64               msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x32     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16               msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x32               msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x16          neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance16x8                msa sse2 ssse3/;
@@ -1112,73 +980,100 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   specialize qw/aom_sub_pixel_variance4x8                 msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance4x4                 msa sse2 ssse3/;
 
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x32      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x64      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x16      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x32      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x16      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x8       msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x16       msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x8        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x4        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x8        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x4        msa sse2 ssse3/;
-
-  if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") {
-    specialize qw/aom_variance4x16 sse2/;
-    specialize qw/aom_variance16x4 sse2/;
-    specialize qw/aom_variance8x32 sse2/;
-    specialize qw/aom_variance32x8 sse2/;
-    specialize qw/aom_variance16x64 sse2/;
-    specialize qw/aom_variance64x16 sse2/;
-    specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
-    specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
-  }
-
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    foreach $bd (8, 10, 12) {
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/aom_sub_pixel_avg_variance128x128 avx2     sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance128x64  avx2     sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x128  avx2     sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x64   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x32   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x64   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x32   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x16   avx2 msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x32        msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x16        msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x8         msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x16         msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x8          msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x4          msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x8          msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
+
+  specialize qw/aom_variance4x16 sse2/;
+  specialize qw/aom_variance16x4 sse2 avx2/;
+  specialize qw/aom_variance8x32 sse2/;
+  specialize qw/aom_variance32x8 sse2 avx2/;
+  specialize qw/aom_variance16x64 sse2 avx2/;
+  specialize qw/aom_variance64x16 sse2 avx2/;
+  specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x8  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x16  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x8   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x4   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance4x8   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance4x4   ssse3/;
+
+  specialize qw/aom_jnt_sub_pixel_avg_variance4x16  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x4  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance8x32  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance32x8  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
+
+  specialize qw/aom_jnt_sub_pixel_avg_variance128x128  ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance128x64   ssse3/;
+  specialize qw/aom_jnt_sub_pixel_avg_variance64x128   ssse3/;
+
+
+  foreach $bd (8, 10, 12) {
+    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-        add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-        add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-        if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-          specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
-        }
-        # TODO(david.barker): When ext-partition-types is enabled, we currently
-        # don't have vectorized 4x16 highbd variance functions
-        if ($w == 4 && $h == 4) {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+        specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+      }
+      # TODO(david.barker): When ext-partition-types is enabled, we currently
+      # don't have vectorized 4x16 highbd variance functions
+      if ($w == 4 && $h == 4) {
           specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
         }
-        if ($w != 128 && $h != 128 && $w != 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
-        }
-        if ($w == 4 && $h == 4) {
-          specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
-          specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
-        }
+      if ($w != 128 && $h != 128 && $w != 4) {
+        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
+      }
+      if ($w == 4 && $h == 4) {
+        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
       }
+
+      add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
     }
-  }  # CONFIG_HIGHBITDEPTH
+  }
 
   #
   # Masked Variance / Masked Subpixel Variance
@@ -1189,7 +1084,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
   }
 
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
     foreach $bd ("_8_", "_10_", "_12_") {
       foreach (@block_sizes) {
         ($w, $h) = @$_;
@@ -1197,30 +1092,28 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
         specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
       }
     }
-  }
+
 
   #
   # OBMC Variance / OBMC Subpixel Variance
   #
-  if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-      add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-      specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
-    }
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+    specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
+  }
 
-    if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-      foreach $bd ("_", "_10_", "_12_") {
-        foreach (@block_sizes) {
-          ($w, $h) = @$_;
-          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-          add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-          specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
-        }
+
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+        specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
       }
     }
-  }
+
 
   add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
@@ -1260,7 +1153,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-
   #
   # Specialty Subpixel
   #
@@ -1277,7 +1169,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   # Comp Avg
   #
   add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
+
+  add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+  specialize qw/aom_jnt_comp_avg_pred ssse3/;
+
+
     add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
     specialize qw/aom_highbd_12_variance64x64 sse2/;
 
@@ -1415,6 +1311,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
     add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
 
+    add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+    specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
+
     #
     # Subpixel Variance
     #
@@ -1634,14 +1533,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
     add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
     add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
-  }  # CONFIG_HIGHBITDEPTH
+
 
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-    add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
-  }
+  specialize qw/aom_comp_mask_pred ssse3 avx2/;
+
+  add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+                                                           int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
+
 
 }  # CONFIG_AV1_ENCODER
 
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
index 58e8bb284..fd4f51b29 100644
--- a/third_party/aom/aom_dsp/aom_filter.h
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -31,6 +31,13 @@ extern "C" {
 #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
 #define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
 
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
+
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
 #define BIL_SUBPEL_BITS 3
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
index 469fd8ed2..392b36627 100644
--- a/third_party/aom/aom_dsp/aom_simd.h
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -18,8 +18,9 @@
 #include <intrin.h>
 #endif
 
-#include "./aom_config.h"
-#include "./aom_simd_inline.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
 
 #define SIMD_CHECK 1  // Sanity checks in C equivalents
 
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c
deleted file mode 100644
index 09429d6d2..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
-                                       int16x4_t dsrc2, int16x4_t dsrc3,
-                                       int16x4_t dsrc4, int16x4_t dsrc5,
-                                       int16x4_t dsrc6, int16x4_t dsrc7,
-                                       int16x8_t q0s16) {
-  int32x4_t qdst;
-  int16x4_t d0s16, d1s16;
-
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-
-  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
-  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
-  return qdst;
-}
-
-void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y,  // unused
-                                  int y_step_q4,            // unused
-                                  int w, int h) {
-  int width;
-  const uint8_t *s;
-  uint8_t *d;
-  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
-  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
-  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-  uint16x8x2_t q0x2u16;
-  uint8x8x2_t d0x2u8, d1x2u8;
-  uint32x2x2_t d0x2u32;
-  uint16x4x2_t d0x2u16, d1x2u16;
-  uint32x4x2_t q0x2u32;
-
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_y;
-
-  q0s16 = vld1q_s16(filter_x);
-
-  src -= 3;                // adjust for taps
-  for (; h > 0; h -= 4) {  // loop_horiz_v
-    s = src;
-    d24u8 = vld1_u8(s);
-    s += src_stride;
-    d25u8 = vld1_u8(s);
-    s += src_stride;
-    d26u8 = vld1_u8(s);
-    s += src_stride;
-    d27u8 = vld1_u8(s);
-
-    q12u8 = vcombine_u8(d24u8, d25u8);
-    q13u8 = vcombine_u8(d26u8, d27u8);
-
-    q0x2u16 =
-        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
-    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
-    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
-    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
-    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
-    d0x2u8 = vtrn_u8(d24u8, d25u8);
-    d1x2u8 = vtrn_u8(d26u8, d27u8);
-
-    __builtin_prefetch(src + src_stride * 4);
-    __builtin_prefetch(src + src_stride * 5);
-
-    q8u16 = vmovl_u8(d0x2u8.val[0]);
-    q9u16 = vmovl_u8(d0x2u8.val[1]);
-    q10u16 = vmovl_u8(d1x2u8.val[0]);
-    q11u16 = vmovl_u8(d1x2u8.val[1]);
-
-    src += 7;
-    d16u16 = vget_low_u16(q8u16);
-    d17u16 = vget_high_u16(q8u16);
-    d18u16 = vget_low_u16(q9u16);
-    d19u16 = vget_high_u16(q9u16);
-    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
-    q9u16 = vcombine_u16(d17u16, d19u16);
-
-    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));         // vmov 23 21
-    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
-      s = src;
-      d28u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d29u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d31u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d30u32 = vld1_dup_u32((const uint32_t *)s);
-
-      __builtin_prefetch(src + 64);
-
-      d0x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
-      d1x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
-                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
-      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
-                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
-
-      __builtin_prefetch(src + 64 + src_stride);
-
-      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
-      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
-      q0x2u32 =
-          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
-
-      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
-      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
-      q12u16 = vmovl_u8(d28u8);
-      q13u16 = vmovl_u8(d29u8);
-
-      __builtin_prefetch(src + 64 + src_stride * 2);
-
-      d = dst;
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
-      d += dst_stride;
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
-                             d23s16, d24s16, q0s16);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
-                             d24s16, d26s16, q0s16);
-      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      __builtin_prefetch(src + 64 + src_stride * 3);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u8 = vqmovn_u16(q1u16);
-      d3u8 = vqmovn_u16(q2u16);
-
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
-      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
-                         vreinterpret_u32_u16(d0x2u16.val[1]));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
-                       vreinterpret_u8_u32(d0x2u32.val[1]));
-
-      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
-      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
-      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
-      d = dst;
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
-      q8u16 = q9u16;
-      d20s16 = d23s16;
-      q11u16 = q12u16;
-      q9u16 = q13u16;
-      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    }
-    src += src_stride * 4 - w - 7;
-    dst += dst_stride * 4 - w;
-  }
-  return;
-}
-
-void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,  // unused
-                                 int x_step_q4,            // unused
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  int height;
-  const uint8_t *s;
-  uint8_t *d;
-  uint8x8_t d2u8, d3u8;
-  uint32x2_t d2u32, d3u32, d6u32, d7u32;
-  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
-  uint8x16_t q1u8, q3u8;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-
-  assert(y_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_x;
-
-  src -= src_stride * 3;
-  q0s16 = vld1q_s16(filter_y);
-  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
-    s = src;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
-    s += src_stride;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
-    s += src_stride;
-    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
-    s += src_stride;
-
-    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
-    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
-    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
-    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d = dst;
-    for (height = h; height > 0; height -= 4) {  // loop_vert
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
-      s += src_stride;
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
-      s += src_stride;
-
-      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
-      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
-      d += dst_stride;
-      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
-      d += dst_stride;
-      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-      d -= dst_stride * 3;
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      __builtin_prefetch(s);
-      __builtin_prefetch(s + src_stride);
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
-                             d22s16, d24s16, q0s16);
-      __builtin_prefetch(s + src_stride * 2);
-      __builtin_prefetch(s + src_stride * 3);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
-                             d24s16, d26s16, q0s16);
-      __builtin_prefetch(d);
-      __builtin_prefetch(d + dst_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      __builtin_prefetch(d + dst_stride * 2);
-      __builtin_prefetch(d + dst_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u8 = vqmovn_u16(q1u16);
-      d3u8 = vqmovn_u16(q2u16);
-
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
-      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
-
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-      d += dst_stride;
-
-      q8u16 = q10u16;
-      d18s16 = d22s16;
-      d19s16 = d24s16;
-      q10u16 = q13u16;
-      d22s16 = d25s16;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
deleted file mode 100644
index 80aef992d..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; AV1_FILTER_WEIGHT == 128
-    ; AV1_FILTER_SHIFT == 7
-
-    EXPORT  |aom_convolve8_avg_horiz_neon|
-    EXPORT  |aom_convolve8_avg_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|aom_convolve8_avg_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-aom_convolve8_avg_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-aom_convolve8_avg_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; slightly out of order load to match the existing data
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
-
-    sub             r2, r2, r3, lsl #2      ; reset for store
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_avg_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt aom_convolve8_avg_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|aom_convolve8_avg_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-aom_convolve8_avg_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-aom_convolve8_avg_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    vld1.u32        {d6[0]}, [r5@32], r3
-    vld1.u32        {d6[1]}, [r8@32], r3
-    vld1.u32        {d7[0]}, [r5@32], r3
-    vld1.u32        {d7[1]}, [r8@32], r3
-
-    pld             [r7]
-    pld             [r4]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    sub             r5, r5, r3, lsl #1      ; reset for store
-    sub             r8, r8, r3, lsl #1
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             aom_convolve8_avg_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_avg_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
deleted file mode 100644
index 8ebffb5f9..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
-                                       int16x4_t dsrc2, int16x4_t dsrc3,
-                                       int16x4_t dsrc4, int16x4_t dsrc5,
-                                       int16x4_t dsrc6, int16x4_t dsrc7,
-                                       int16x8_t q0s16) {
-  int32x4_t qdst;
-  int16x4_t d0s16, d1s16;
-
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-
-  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
-  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
-  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
-  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
-  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
-  return qdst;
-}
-
-void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y,  // unused
-                              int y_step_q4,            // unused
-                              int w, int h) {
-  int width;
-  const uint8_t *s, *psrc;
-  uint8_t *d, *pdst;
-  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
-  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
-  uint8x16_t q12u8, q13u8, q14u8, q15u8;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-  uint16x8x2_t q0x2u16;
-  uint8x8x2_t d0x2u8, d1x2u8;
-  uint32x2x2_t d0x2u32;
-  uint16x4x2_t d0x2u16, d1x2u16;
-  uint32x4x2_t q0x2u32;
-
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_y;
-
-  q0s16 = vld1q_s16(filter_x);
-
-  src -= 3;  // adjust for taps
-  for (; h > 0; h -= 4, src += src_stride * 4,
-                dst += dst_stride * 4) {  // loop_horiz_v
-    s = src;
-    d24u8 = vld1_u8(s);
-    s += src_stride;
-    d25u8 = vld1_u8(s);
-    s += src_stride;
-    d26u8 = vld1_u8(s);
-    s += src_stride;
-    d27u8 = vld1_u8(s);
-
-    q12u8 = vcombine_u8(d24u8, d25u8);
-    q13u8 = vcombine_u8(d26u8, d27u8);
-
-    q0x2u16 =
-        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
-    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
-    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
-    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
-    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
-    d0x2u8 = vtrn_u8(d24u8, d25u8);
-    d1x2u8 = vtrn_u8(d26u8, d27u8);
-
-    __builtin_prefetch(src + src_stride * 4);
-    __builtin_prefetch(src + src_stride * 5);
-    __builtin_prefetch(src + src_stride * 6);
-
-    q8u16 = vmovl_u8(d0x2u8.val[0]);
-    q9u16 = vmovl_u8(d0x2u8.val[1]);
-    q10u16 = vmovl_u8(d1x2u8.val[0]);
-    q11u16 = vmovl_u8(d1x2u8.val[1]);
-
-    d16u16 = vget_low_u16(q8u16);
-    d17u16 = vget_high_u16(q8u16);
-    d18u16 = vget_low_u16(q9u16);
-    d19u16 = vget_high_u16(q9u16);
-    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
-    q9u16 = vcombine_u16(d17u16, d19u16);
-
-    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
-    for (width = w, psrc = src + 7, pdst = dst; width > 0;
-         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
-      s = psrc;
-      d28u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d29u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d31u32 = vld1_dup_u32((const uint32_t *)s);
-      s += src_stride;
-      d30u32 = vld1_dup_u32((const uint32_t *)s);
-
-      __builtin_prefetch(psrc + 64);
-
-      d0x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
-      d1x2u16 =
-          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
-                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
-      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
-                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
-
-      __builtin_prefetch(psrc + 64 + src_stride);
-
-      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
-      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
-      q0x2u32 =
-          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
-
-      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
-      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
-      q12u16 = vmovl_u8(d28u8);
-      q13u16 = vmovl_u8(d29u8);
-
-      __builtin_prefetch(psrc + 64 + src_stride * 2);
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
-                             d23s16, d24s16, q0s16);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
-                             d24s16, d26s16, q0s16);
-      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      __builtin_prefetch(psrc + 60 + src_stride * 3);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u8 = vqmovn_u16(q1u16);
-      d3u8 = vqmovn_u16(q2u16);
-
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
-      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
-                         vreinterpret_u32_u16(d0x2u16.val[1]));
-      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
-                       vreinterpret_u8_u32(d0x2u32.val[1]));
-
-      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
-      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
-
-      d = pdst;
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-
-      q8u16 = q9u16;
-      d20s16 = d23s16;
-      q11u16 = q12u16;
-      q9u16 = q13u16;
-      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    }
-  }
-  return;
-}
-
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,  // unused
-                             int x_step_q4,            // unused
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  int height;
-  const uint8_t *s;
-  uint8_t *d;
-  uint32x2_t d2u32, d3u32;
-  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-
-  assert(y_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_x;
-
-  src -= src_stride * 3;
-  q0s16 = vld1q_s16(filter_y);
-  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
-    s = src;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
-    s += src_stride;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
-    s += src_stride;
-    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
-    s += src_stride;
-
-    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
-    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
-    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
-    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d = dst;
-    for (height = h; height > 0; height -= 4) {  // loop_vert
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
-      s += src_stride;
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
-      s += src_stride;
-
-      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
-      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      __builtin_prefetch(d);
-      __builtin_prefetch(d + dst_stride);
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
-                             d22s16, d24s16, q0s16);
-      __builtin_prefetch(d + dst_stride * 2);
-      __builtin_prefetch(d + dst_stride * 3);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
-                             d24s16, d26s16, q0s16);
-      __builtin_prefetch(s);
-      __builtin_prefetch(s + src_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
-                              d26s16, d27s16, q0s16);
-      __builtin_prefetch(s + src_stride * 2);
-      __builtin_prefetch(s + src_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
-                              d27s16, d25s16, q0s16);
-
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
-      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
-
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-      d += dst_stride;
-
-      q8u16 = q10u16;
-      d18s16 = d22s16;
-      d19s16 = d24s16;
-      q10u16 = q13u16;
-      d22s16 = d25s16;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm
deleted file mode 100644
index 38207d864..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; AV1_FILTER_WEIGHT == 128
-    ; AV1_FILTER_SHIFT == 7
-
-    EXPORT  |aom_convolve8_horiz_neon|
-    EXPORT  |aom_convolve8_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|aom_convolve8_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-aom_convolve8_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-aom_convolve8_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt aom_convolve8_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|aom_convolve8_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-aom_convolve8_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-aom_convolve8_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r7]
-    pld             [r4]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             aom_convolve8_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             aom_convolve8_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c
deleted file mode 100644
index f05d3ceae..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-
-void aom_convolve_avg_neon(const uint8_t *src,    // r0
-                           ptrdiff_t src_stride,  // r1
-                           uint8_t *dst,          // r2
-                           ptrdiff_t dst_stride,  // r3
-                           const int16_t *filter_x, int filter_x_stride,
-                           const int16_t *filter_y, int filter_y_stride, int w,
-                           int h) {
-  uint8_t *d;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint32x2_t d0u32, d2u32;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  d = dst;
-  if (w > 32) {  // avg64
-    for (; h > 0; h -= 1) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      q2u8 = vld1q_u8(src + 32);
-      q3u8 = vld1q_u8(src + 48);
-      src += src_stride;
-      q8u8 = vld1q_u8(d);
-      q9u8 = vld1q_u8(d + 16);
-      q10u8 = vld1q_u8(d + 32);
-      q11u8 = vld1q_u8(d + 48);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q8u8);
-      q1u8 = vrhaddq_u8(q1u8, q9u8);
-      q2u8 = vrhaddq_u8(q2u8, q10u8);
-      q3u8 = vrhaddq_u8(q3u8, q11u8);
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      vst1q_u8(dst + 32, q2u8);
-      vst1q_u8(dst + 48, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w == 32) {  // avg32
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q2u8 = vld1q_u8(src);
-      q3u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q8u8 = vld1q_u8(d);
-      q9u8 = vld1q_u8(d + 16);
-      d += dst_stride;
-      q10u8 = vld1q_u8(d);
-      q11u8 = vld1q_u8(d + 16);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q8u8);
-      q1u8 = vrhaddq_u8(q1u8, q9u8);
-      q2u8 = vrhaddq_u8(q2u8, q10u8);
-      q3u8 = vrhaddq_u8(q3u8, q11u8);
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q2u8);
-      vst1q_u8(dst + 16, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w > 8) {  // avg16
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      src += src_stride;
-      q1u8 = vld1q_u8(src);
-      src += src_stride;
-      q2u8 = vld1q_u8(d);
-      d += dst_stride;
-      q3u8 = vld1q_u8(d);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q2u8);
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      vst1q_u8(dst, q0u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q1u8);
-      dst += dst_stride;
-    }
-  } else if (w == 8) {  // avg8
-    for (; h > 0; h -= 2) {
-      d0u8 = vld1_u8(src);
-      src += src_stride;
-      d1u8 = vld1_u8(src);
-      src += src_stride;
-      d2u8 = vld1_u8(d);
-      d += dst_stride;
-      d3u8 = vld1_u8(d);
-      d += dst_stride;
-
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      q0u8 = vrhaddq_u8(q0u8, q1u8);
-
-      vst1_u8(dst, vget_low_u8(q0u8));
-      dst += dst_stride;
-      vst1_u8(dst, vget_high_u8(q0u8));
-      dst += dst_stride;
-    }
-  } else {  // avg4
-    for (; h > 0; h -= 2) {
-      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
-      src += src_stride;
-      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
-      src += src_stride;
-      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-
-      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
-
-      d0u32 = vreinterpret_u32_u8(d0u8);
-      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
-      dst += dst_stride;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
deleted file mode 100644
index 43c300954..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_convolve_avg_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|aom_convolve_avg_neon| PROC
-    push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #32]
-    mov                 r6, r2
-
-    cmp                 r4, #32
-    bgt                 avg64
-    beq                 avg32
-    cmp                 r4, #8
-    bgt                 avg16
-    beq                 avg8
-    b                   avg4
-
-avg64
-    sub                 lr, r1, #32
-    sub                 r4, r3, #32
-avg64_h
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    pld                 [r2, r3]
-    vld1.8              {q8-q9},   [r6@128]!
-    vld1.8              {q10-q11}, [r6@128], r4
-    vrhadd.u8           q0, q0, q8
-    vrhadd.u8           q1, q1, q9
-    vrhadd.u8           q2, q2, q10
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2@128]!
-    vst1.8              {q2-q3}, [r2@128], r4
-    subs                r5, r5, #1
-    bgt                 avg64_h
-    pop                 {r4-r6, pc}
-
-avg32
-    vld1.8              {q0-q1}, [r0], r1
-    vld1.8              {q2-q3}, [r0], r1
-    vld1.8              {q8-q9},   [r6@128], r3
-    vld1.8              {q10-q11}, [r6@128], r3
-    pld                 [r0]
-    vrhadd.u8           q0, q0, q8
-    pld                 [r0, r1]
-    vrhadd.u8           q1, q1, q9
-    pld                 [r6]
-    vrhadd.u8           q2, q2, q10
-    pld                 [r6, r3]
-    vrhadd.u8           q3, q3, q11
-    vst1.8              {q0-q1}, [r2@128], r3
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 avg32
-    pop                 {r4-r6, pc}
-
-avg16
-    vld1.8              {q0}, [r0], r1
-    vld1.8              {q1}, [r0], r1
-    vld1.8              {q2}, [r6@128], r3
-    vld1.8              {q3}, [r6@128], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q2
-    pld                 [r6]
-    pld                 [r6, r3]
-    vrhadd.u8           q1, q1, q3
-    vst1.8              {q0}, [r2@128], r3
-    vst1.8              {q1}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 avg16
-    pop                 {r4-r6, pc}
-
-avg8
-    vld1.8              {d0}, [r0], r1
-    vld1.8              {d1}, [r0], r1
-    vld1.8              {d2}, [r6@64], r3
-    vld1.8              {d3}, [r6@64], r3
-    pld                 [r0]
-    pld                 [r0, r1]
-    vrhadd.u8           q0, q0, q1
-    pld                 [r6]
-    pld                 [r6, r3]
-    vst1.8              {d0}, [r2@64], r3
-    vst1.8              {d1}, [r2@64], r3
-    subs                r5, r5, #2
-    bgt                 avg8
-    pop                 {r4-r6, pc}
-
-avg4
-    vld1.32             {d0[0]}, [r0], r1
-    vld1.32             {d0[1]}, [r0], r1
-    vld1.32             {d2[0]}, [r6@32], r3
-    vld1.32             {d2[1]}, [r6@32], r3
-    vrhadd.u8           d0, d0, d2
-    vst1.32             {d0[0]}, [r2@32], r3
-    vst1.32             {d0[1]}, [r2@32], r3
-    subs                r5, r5, #2
-    bgt                 avg4
-    pop                 {r4-r6, pc}
-    ENDP
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
deleted file mode 100644
index 9e57c7176..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-
-void aom_convolve_copy_neon(const uint8_t *src,    // r0
-                            ptrdiff_t src_stride,  // r1
-                            uint8_t *dst,          // r2
-                            ptrdiff_t dst_stride,  // r3
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
-                            int h) {
-  uint8x8_t d0u8, d2u8;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  if (w > 32) {  // copy64
-    for (; h > 0; h--) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      q2u8 = vld1q_u8(src + 32);
-      q3u8 = vld1q_u8(src + 48);
-      src += src_stride;
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      vst1q_u8(dst + 32, q2u8);
-      vst1q_u8(dst + 48, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w == 32) {  // copy32
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q2u8 = vld1q_u8(src);
-      q3u8 = vld1q_u8(src + 16);
-      src += src_stride;
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q2u8);
-      vst1q_u8(dst + 16, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w > 8) {  // copy16
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      src += src_stride;
-      q1u8 = vld1q_u8(src);
-      src += src_stride;
-
-      vst1q_u8(dst, q0u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q1u8);
-      dst += dst_stride;
-    }
-  } else if (w == 8) {  // copy8
-    for (; h > 0; h -= 2) {
-      d0u8 = vld1_u8(src);
-      src += src_stride;
-      d2u8 = vld1_u8(src);
-      src += src_stride;
-
-      vst1_u8(dst, d0u8);
-      dst += dst_stride;
-      vst1_u8(dst, d2u8);
-      dst += dst_stride;
-    }
-  } else {  // copy4
-    for (; h > 0; h--) {
-      *(uint32_t *)dst = *(const uint32_t *)src;
-      src += src_stride;
-      dst += dst_stride;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
deleted file mode 100644
index 443d7178a..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
+++ /dev/null
@@ -1,87 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_convolve_copy_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|aom_convolve_copy_neon| PROC
-    push                {r4-r5, lr}
-    ldrd                r4, r5, [sp, #28]
-
-    cmp                 r4, #32
-    bgt                 copy64
-    beq                 copy32
-    cmp                 r4, #8
-    bgt                 copy16
-    beq                 copy8
-    b                   copy4
-
-copy64
-    sub                 lr, r1, #32
-    sub                 r3, r3, #32
-copy64_h
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0]!
-    vld1.8              {q2-q3}, [r0], lr
-    vst1.8              {q0-q1}, [r2@128]!
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #1
-    bgt                 copy64_h
-    pop                 {r4-r5, pc}
-
-copy32
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0-q1}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q2-q3}, [r0], r1
-    vst1.8              {q0-q1}, [r2@128], r3
-    vst1.8              {q2-q3}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 copy32
-    pop                 {r4-r5, pc}
-
-copy16
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q0}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {q1}, [r0], r1
-    vst1.8              {q0}, [r2@128], r3
-    vst1.8              {q1}, [r2@128], r3
-    subs                r5, r5, #2
-    bgt                 copy16
-    pop                 {r4-r5, pc}
-
-copy8
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {d0}, [r0], r1
-    pld                 [r0, r1, lsl #1]
-    vld1.8              {d2}, [r0], r1
-    vst1.8              {d0}, [r2@64], r3
-    vst1.8              {d2}, [r2@64], r3
-    subs                r5, r5, #2
-    bgt                 copy8
-    pop                 {r4-r5, pc}
-
-copy4
-    ldr                 r12, [r0], r1
-    str                 r12, [r2], r3
-    subs                r5, r5, #1
-    bgt                 copy4
-    pop                 {r4-r5, pc}
-    ENDP
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c
deleted file mode 100644
index 6c2997e04..000000000
--- a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
-   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
-   */
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-
-  // Account for the vertical phase needing 3 lines prior and 4 lines post
-  int intermediate_height = h + 7;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  /* Filter starting 3 lines back. The neon implementation will ignore the
-   * given height and filter a multiple of 4 lines. Since this goes in to
-   * the temp buffer which has lots of extra room and is subsequently discarded
-   * this is safe if somewhat less than ideal.
-   */
-  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
-                           intermediate_height);
-
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                          x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h) {
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-  int intermediate_height = h + 7;
-
-  assert(y_step_q4 == 16);
-  assert(x_step_q4 == 16);
-
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
-                           intermediate_height);
-  aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
-}
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c
deleted file mode 100644
index 6ff760017..000000000
--- a/third_party/aom/aom_dsp/arm/avg_neon.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int aom_satd_neon(const int16_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
-
-  do {
-    const int16x8_t src0 = vld1q_s16(coeff);
-    const int16x8_t src8 = vld1q_s16(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
-    length -= 16;
-    coeff += 16;
-  } while (length != 0);
-
-  {
-    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int satd = vget_lane_s32(s1, 0);
-    return satd;
-  }
-}
-
-void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride,
-                          int height) {
-  int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
-  }
-
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
-}
-
-int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
-  int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
-
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
-  }
-
-  return horizontal_add_u16x8(vec_sum);
-}
-
-// ref, src = [0, 510] - max diff = 16-bits
-// bwl = {2, 3, 4}, width = {16, 32, 64}
-int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) {
-  int width = 4 << bwl;
-  int32x4_t sse = vdupq_n_s32(0);
-  int16x8_t total = vdupq_n_s16(0);
-
-  assert(width >= 8);
-  assert((width % 8) == 0);
-
-  do {
-    const int16x8_t r = vld1q_s16(ref);
-    const int16x8_t s = vld1q_s16(src);
-    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
-    sse = vmlal_s16(sse, diff_hi, diff_hi);
-    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
-
-    ref += 8;
-    src += 8;
-    width -= 8;
-  } while (width != 0);
-
-  {
-    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
-    // with the summation of 'sse' performed better on a Cortex-A15.
-    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
-    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
-    const int32x2_t t2 = vpadd_s32(t1, t1);
-    const int t = vget_lane_s32(t2, 0);
-    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int s = vget_lane_s32(s1, 0);
-    const int shift_factor = bwl + 2;
-    return s - ((t * t) >> shift_factor);
-  }
-}
-
-void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                         int b_stride, int *min, int *max) {
-  // Load and concatenate.
-  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
-  const uint8x16_t a23 =
-      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
-  const uint8x16_t a45 =
-      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
-  const uint8x16_t a67 =
-      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
-
-  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
-  const uint8x16_t b23 =
-      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
-  const uint8x16_t b45 =
-      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
-  const uint8x16_t b67 =
-      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
-
-  // Absolute difference.
-  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
-  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
-  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
-  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
-
-  // Max values between the Q vectors.
-  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
-  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
-  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
-  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
-
-  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
-  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
-
-  // Split to D and start doing pairwise.
-  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
-  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
-
-  // Enough runs of vpmax/min propogate the max/min values to every position.
-  ab_max = vpmax_u8(ab_max, ab_max);
-  ab_min = vpmin_u8(ab_min, ab_min);
-
-  ab_max = vpmax_u8(ab_max, ab_max);
-  ab_min = vpmin_u8(ab_min, ab_min);
-
-  ab_max = vpmax_u8(ab_max, ab_max);
-  ab_min = vpmin_u8(ab_min, ab_min);
-
-  *min = *max = 0;  // Clear high bits
-  // Store directly to avoid costly neon->gpr transfer.
-  vst1_lane_u8((uint8_t *)max, ab_max, 0);
-  vst1_lane_u8((uint8_t *)min, ab_min, 0);
-}
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 000000000..82c0b0e28
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
+                            const int16x8_t v_maxval, int16x8_t *res) {
+  int32x4_t im_res_low, im_res_high;
+  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+
+  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
+  im_res_low =
+      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+
+  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
+  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
+                          vget_high_s16(src_1));
+
+  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
+                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
+}
+
+static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
+                             int16x8_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1, src0_2, src0_3;
+  int16x8_t src1_0, src1_1, src1_2, src1_3;
+  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+
+  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
+               &src0_3);
+  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
+               &src1_3);
+
+  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
+  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
+  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
+  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+
+  uint16x8_t im_res1_0 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
+  uint16x8_t im_res1_1 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
+  uint16x8_t im_res1_2 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
+  uint16x8_t im_res1_3 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
+
+  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
+  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
+  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
+  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
+
+  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
+  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
+  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
+  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
+}
+
+static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
+                             int16x4_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1;
+  int16x8_t src1_0, src1_1;
+  uint64x2_t tu0, tu1, tu2, tu3;
+  int16x8_t mask0_1, mask2_3;
+  int16x8_t res0, res1;
+
+  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
+  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
+
+  src0_0 = vreinterpretq_s16_u64(tu0);
+  src0_1 = vreinterpretq_s16_u64(tu1);
+
+  src1_0 = vreinterpretq_s16_u64(tu2);
+  src1_1 = vreinterpretq_s16_u64(tu3);
+
+  mask0_1 = vcombine_s16(mask0, mask1);
+  mask2_3 = vcombine_s16(mask2, mask3);
+
+  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
+  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
+
+  uint16x8_t im_res_0 =
+      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
+  uint16x8_t im_res_1 =
+      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
+
+  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
+  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
+
+  uint8x8_t res_0 = vqmovun_s16(src0_0);
+  uint8x8_t res_1 = vqmovun_s16(src0_1);
+
+  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
+                0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
+                1);
+  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
+                0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
+                1);
+}
+
+void aom_lowbd_blend_a64_d16_mask_neon(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  int i = 0;
+  const int bd = 8;
+  int w_tmp = w;
+  const uint8_t *mask_tmp = mask;
+  const CONV_BUF_TYPE *src0_tmp = src0;
+  const CONV_BUF_TYPE *src1_tmp = src1;
+  uint8_t *dst_tmp = dst;
+
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  uint8x8_t s0, s1, s2, s3;
+  uint32x2_t tu0, tu1, tu2, tu3;
+  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+  int16x8_t mask0, mask1, mask2, mask3;
+  int16x8_t mask4, mask5, mask6, mask7;
+  int32x4_t m0_32, m1_32, m2_32, m3_32;
+  int32x4_t m4_32, m5_32, m6_32, m7_32;
+  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
+  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
+  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
+  const uint16x4_t vec_zero = vdup_n_u16(0);
+  const uint16_t offset = round_offset - (1 << (round_bits - 1));
+  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
+  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+  const uint16x8_t vec_offset = vdupq_n_u16(offset);
+
+  if (subw == 0 && subh == 0) {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
+
+          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
+          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
+          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
+          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+
+          w_tmp -= 8;
+          mask_tmp += 8;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (4 * mask_stride) - w;
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
+
+        mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+        mask0_low = vget_low_s16(mask0);
+        mask1_low = vget_high_s16(mask0);
+        mask2_low = vget_low_s16(mask1);
+        mask3_low = vget_high_s16(mask1);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (4 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  } else if (subw == 1 && subh == 1) {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                       &t7);
+
+          mask0 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
+          mask1 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
+          mask2 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
+          mask3 =
+              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
+
+          mask4 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
+          mask5 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
+          mask6 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
+          mask7 = vreinterpretq_s16_u16(
+              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
+
+          m0_32 = vpaddlq_s16(mask0);
+          m1_32 = vpaddlq_s16(mask1);
+          m2_32 = vpaddlq_s16(mask2);
+          m3_32 = vpaddlq_s16(mask3);
+
+          m4_32 = vpaddlq_s16(mask4);
+          m5_32 = vpaddlq_s16(mask5);
+          m6_32 = vpaddlq_s16(mask6);
+          m7_32 = vpaddlq_s16(mask7);
+
+          mask0 =
+              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
+          mask1 =
+              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
+          mask2 =
+              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
+          mask3 =
+              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+
+          w_tmp -= 8;
+          mask_tmp += 16;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (8 * mask_stride) - (2 * w);
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+
+        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
+        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
+        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
+        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+
+        m0_32 = vpaddlq_s16(mask0);
+        m1_32 = vpaddlq_s16(mask1);
+        m2_32 = vpaddlq_s16(mask2);
+        m3_32 = vpaddlq_s16(mask3);
+
+        mask0_low = vqrshrn_n_s32(m0_32, 2);
+        mask1_low = vqrshrn_n_s32(m1_32, 2);
+        mask2_low = vqrshrn_n_s32(m2_32, 2);
+        mask3_low = vqrshrn_n_s32(m3_32, 2);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (8 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  } else if (subw == 1 && subh == 0) {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
+
+          mask0 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
+          mask1 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
+          mask2 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
+          mask3 = vreinterpretq_s16_u16(vcombine_u16(
+              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
+
+          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
+          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+          w_tmp -= 8;
+          mask_tmp += 16;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (4 * mask_stride) - (2 * w);
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+                    &mask3_l);
+
+        mask0 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
+        mask1 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
+        mask2 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
+        mask3 =
+            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
+
+        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
+        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
+        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
+        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (4 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  } else {
+    if (w_tmp > 7) {
+      do {
+        w_tmp = w;
+        do {
+          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+
+          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
+          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
+          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
+          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+
+          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
+          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+
+          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+                    vec_offset, vec_round_bits);
+
+          w_tmp -= 8;
+          mask_tmp += 8;
+          dst_tmp += 8;
+          src0_tmp += 8;
+          src1_tmp += 8;
+        } while (w_tmp > 7);
+        i += 4;
+        mask_tmp += (8 * mask_stride) - w;
+        dst_tmp += (4 * dst_stride) - w;
+        src0_tmp += (4 * src0_stride) - w;
+        src1_tmp += (4 * src1_stride) - w;
+      } while (i < h);
+    } else {
+      do {
+        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
+        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
+                              &tu3);
+
+        s0 = vreinterpret_u8_u32(tu0);
+        s1 = vreinterpret_u8_u32(tu1);
+        s2 = vreinterpret_u8_u32(tu2);
+        s3 = vreinterpret_u8_u32(tu3);
+
+        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
+        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+
+        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+
+        mask0_low = vget_low_s16(mask0);
+        mask1_low = vget_high_s16(mask0);
+        mask2_low = vget_low_s16(mask1);
+        mask3_low = vget_high_s16(mask1);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (8 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
index 1cf8a3a6e..e4300c992 100644
--- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -11,7 +11,8 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_dsp/txfm_common.h"
 
 void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c
deleted file mode 100644
index 9baefae47..000000000
--- a/third_party/aom/aom_dsp/arm/hadamard_neon.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-
-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
-                                 int16x8_t *a6, int16x8_t *a7) {
-  const int16x8_t b0 = vaddq_s16(*a0, *a1);
-  const int16x8_t b1 = vsubq_s16(*a0, *a1);
-  const int16x8_t b2 = vaddq_s16(*a2, *a3);
-  const int16x8_t b3 = vsubq_s16(*a2, *a3);
-  const int16x8_t b4 = vaddq_s16(*a4, *a5);
-  const int16x8_t b5 = vsubq_s16(*a4, *a5);
-  const int16x8_t b6 = vaddq_s16(*a6, *a7);
-  const int16x8_t b7 = vsubq_s16(*a6, *a7);
-
-  const int16x8_t c0 = vaddq_s16(b0, b2);
-  const int16x8_t c1 = vaddq_s16(b1, b3);
-  const int16x8_t c2 = vsubq_s16(b0, b2);
-  const int16x8_t c3 = vsubq_s16(b1, b3);
-  const int16x8_t c4 = vaddq_s16(b4, b6);
-  const int16x8_t c5 = vaddq_s16(b5, b7);
-  const int16x8_t c6 = vsubq_s16(b4, b6);
-  const int16x8_t c7 = vsubq_s16(b5, b7);
-
-  *a0 = vaddq_s16(c0, c4);
-  *a1 = vsubq_s16(c2, c6);
-  *a2 = vsubq_s16(c0, c4);
-  *a3 = vaddq_s16(c2, c6);
-  *a4 = vaddq_s16(c3, c7);
-  *a5 = vsubq_s16(c3, c7);
-  *a6 = vsubq_s16(c1, c5);
-  *a7 = vaddq_s16(c1, c5);
-}
-
-// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
-// reversing transpose order which may make it easier for the compiler to
-// reconcile the vtrn.64 moves.
-static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
-                         int16x8_t *a6, int16x8_t *a7) {
-  // Swap 64 bit elements. Goes from:
-  // a0: 00 01 02 03 04 05 06 07
-  // a1: 08 09 10 11 12 13 14 15
-  // a2: 16 17 18 19 20 21 22 23
-  // a3: 24 25 26 27 28 29 30 31
-  // a4: 32 33 34 35 36 37 38 39
-  // a5: 40 41 42 43 44 45 46 47
-  // a6: 48 49 50 51 52 53 54 55
-  // a7: 56 57 58 59 60 61 62 63
-  // to:
-  // a04_lo: 00 01 02 03 32 33 34 35
-  // a15_lo: 08 09 10 11 40 41 42 43
-  // a26_lo: 16 17 18 19 48 49 50 51
-  // a37_lo: 24 25 26 27 56 57 58 59
-  // a04_hi: 04 05 06 07 36 37 38 39
-  // a15_hi: 12 13 14 15 44 45 46 47
-  // a26_hi: 20 21 22 23 52 53 54 55
-  // a37_hi: 28 29 30 31 60 61 62 63
-  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
-  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
-  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
-  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
-  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
-  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
-  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
-  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
-
-  // Swap 32 bit elements resulting in:
-  // a0246_lo:
-  // 00 01 16 17 32 33 48 49
-  // 02 03 18 19 34 35 50 51
-  // a1357_lo:
-  // 08 09 24 25 40 41 56 57
-  // 10 11 26 27 42 43 58 59
-  // a0246_hi:
-  // 04 05 20 21 36 37 52 53
-  // 06 07 22 23 38 39 54 55
-  // a1657_hi:
-  // 12 13 28 29 44 45 60 61
-  // 14 15 30 31 46 47 62 63
-  const int32x4x2_t a0246_lo =
-      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
-  const int32x4x2_t a1357_lo =
-      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
-  const int32x4x2_t a0246_hi =
-      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
-  const int32x4x2_t a1357_hi =
-      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
-
-  // Swap 16 bit elements resulting in:
-  // b0:
-  // 00 08 16 24 32 40 48 56
-  // 01 09 17 25 33 41 49 57
-  // b1:
-  // 02 10 18 26 34 42 50 58
-  // 03 11 19 27 35 43 51 59
-  // b2:
-  // 04 12 20 28 36 44 52 60
-  // 05 13 21 29 37 45 53 61
-  // b3:
-  // 06 14 22 30 38 46 54 62
-  // 07 15 23 31 39 47 55 63
-  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
-                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
-  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
-                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
-  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
-                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
-  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
-                                   vreinterpretq_s16_s32(a1357_hi.val[1]));
-
-  *a0 = b0.val[0];
-  *a1 = b0.val[1];
-  *a2 = b1.val[0];
-  *a3 = b1.val[1];
-  *a4 = b2.val[0];
-  *a5 = b2.val[1];
-  *a6 = b3.val[0];
-  *a7 = b3.val[1];
-}
-
-void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
-                           int16_t *coeff) {
-  int16x8_t a0 = vld1q_s16(src_diff);
-  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
-  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
-  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
-  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
-  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
-  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
-  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-
-  // Skip the second transpose because it is not required.
-
-  vst1q_s16(coeff + 0, a0);
-  vst1q_s16(coeff + 8, a1);
-  vst1q_s16(coeff + 16, a2);
-  vst1q_s16(coeff + 24, a3);
-  vst1q_s16(coeff + 32, a4);
-  vst1q_s16(coeff + 40, a5);
-  vst1q_s16(coeff + 48, a6);
-  vst1q_s16(coeff + 56, a7);
-}
-
-void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int i;
-
-  /* Rearrange 16x16 to 8x32 and remove stride.
-   * Top left first. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
-  /* Top right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
-  /* Bottom left. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
-  /* Bottom right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
-
-  for (i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = vld1q_s16(coeff + 0);
-    const int16x8_t a1 = vld1q_s16(coeff + 64);
-    const int16x8_t a2 = vld1q_s16(coeff + 128);
-    const int16x8_t a3 = vld1q_s16(coeff + 192);
-
-    const int16x8_t b0 = vhaddq_s16(a0, a1);
-    const int16x8_t b1 = vhsubq_s16(a0, a1);
-    const int16x8_t b2 = vhaddq_s16(a2, a3);
-    const int16x8_t b3 = vhsubq_s16(a2, a3);
-
-    const int16x8_t c0 = vaddq_s16(b0, b2);
-    const int16x8_t c1 = vaddq_s16(b1, b3);
-    const int16x8_t c2 = vsubq_s16(b0, b2);
-    const int16x8_t c3 = vsubq_s16(b1, b3);
-
-    vst1q_s16(coeff + 0, c0);
-    vst1q_s16(coeff + 64, c1);
-    vst1q_s16(coeff + 128, c2);
-    vst1q_s16(coeff + 192, c3);
-
-    coeff += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
deleted file mode 100644
index 196b2a890..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d2u8, d3u8, d30u8, d31u8;
-  uint64x1_t d2u64, d3u64, d4u64, d5u64;
-  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-  int16x8_t q0s16;
-  uint8_t *d1, *d2;
-  int16_t i, j, a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  q0s16 = vdupq_n_s16(a1);
-  q0u16 = vreinterpretq_u16_s16(q0s16);
-
-  for (d1 = d2 = dest, i = 0; i < 4; i++) {
-    for (j = 0; j < 2; j++) {
-      d2u64 = vld1_u64((const uint64_t *)d1);
-      d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
-      d1 += dest_stride;
-      d4u64 = vld1_u64((const uint64_t *)d1);
-      d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
-      d1 += dest_stride;
-
-      q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-      q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-      q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-      q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-      d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-      d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-      d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-      d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
-      d2 += dest_stride;
-      vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-      vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
-      d2 += dest_stride;
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm
deleted file mode 100644
index d01c4bc03..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm
+++ /dev/null
@@ -1,201 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-
-    EXPORT  |aom_idct16x16_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
-;                                    int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct16x16_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asr              r0, r0, #6                ; >> 6
-
-    vdup.s16         q0, r0                    ; duplicate a1
-    mov              r0, #8
-    sub              r2, #8
-
-    ; load destination data row0 - row3
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row4 - row7
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row8 - row11
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row12 - row15
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |aom_idct16x16_1_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c
deleted file mode 100644
index b4cb7a0cd..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c
+++ /dev/null
@@ -1,1295 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
-                                int16x8_t *q10s16, int16x8_t *q11s16,
-                                int16x8_t *q12s16, int16x8_t *q13s16,
-                                int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-  *q12s16 = vcombine_s16(d17s16, d25s16);
-  *q13s16 = vcombine_s16(d19s16, d27s16);
-  *q14s16 = vcombine_s16(d21s16, d29s16);
-  *q15s16 = vcombine_s16(d23s16, d31s16);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
-  q1x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
-  q2x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
-  q3x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
-
-  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-  *q8s16 = q0x2s16.val[0];
-  *q9s16 = q0x2s16.val[1];
-  *q10s16 = q1x2s16.val[0];
-  *q11s16 = q1x2s16.val[1];
-  *q12s16 = q2x2s16.val[0];
-  *q13s16 = q2x2s16.val[1];
-  *q14s16 = q3x2s16.val[0];
-  *q15s16 = q3x2s16.val[1];
-  return;
-}
-
-void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
-                                      int output_stride) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = vld2q_s16(in);
-  q8s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q9s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q10s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q11s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q12s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q13s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q14s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-  d30s16 = vget_low_s16(q15s16);
-  d31s16 = vget_high_s16(q15s16);
-
-  // stage 3
-  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d18s16, d1s16);
-  q6s32 = vmull_s16(d19s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
-  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
-  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
-  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
-  d8s16 = vqrshrn_n_s32(q2s32, 14);
-  d9s16 = vqrshrn_n_s32(q3s32, 14);
-  d14s16 = vqrshrn_n_s32(q5s32, 14);
-  d15s16 = vqrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  q2s32 = vmull_s16(d26s16, d2s16);
-  q3s32 = vmull_s16(d27s16, d2s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q15s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
-  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
-  d10s16 = vqrshrn_n_s32(q2s32, 14);
-  d11s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q15s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 4
-  d30s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d30s16);
-  q11s32 = vmull_s16(d17s16, d30s16);
-  q0s32 = vmull_s16(d24s16, d30s16);
-  q1s32 = vmull_s16(d25s16, d30s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_24_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_8_64);
-
-  q3s32 = vaddq_s32(q2s32, q0s32);
-  q12s32 = vaddq_s32(q11s32, q1s32);
-  q13s32 = vsubq_s32(q2s32, q0s32);
-  q1s32 = vsubq_s32(q11s32, q1s32);
-
-  d16s16 = vqrshrn_n_s32(q3s32, 14);
-  d17s16 = vqrshrn_n_s32(q12s32, 14);
-  d18s16 = vqrshrn_n_s32(q13s32, 14);
-  d19s16 = vqrshrn_n_s32(q1s32, 14);
-  q8s16 = vcombine_s16(d16s16, d17s16);
-  q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q0s32 = vmull_s16(d20s16, d31s16);
-  q1s32 = vmull_s16(d21s16, d31s16);
-  q12s32 = vmull_s16(d20s16, d30s16);
-  q13s32 = vmull_s16(d21s16, d30s16);
-
-  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
-  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
-  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
-  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
-  d22s16 = vqrshrn_n_s32(q0s32, 14);
-  d23s16 = vqrshrn_n_s32(q1s32, 14);
-  d20s16 = vqrshrn_n_s32(q12s32, 14);
-  d21s16 = vqrshrn_n_s32(q13s32, 14);
-  q10s16 = vcombine_s16(d20s16, d21s16);
-  q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q15s16 = vaddq_s16(q6s16, q7s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  // stage 5
-  q0s16 = vaddq_s16(q8s16, q11s16);
-  q1s16 = vaddq_s16(q9s16, q10s16);
-  q2s16 = vsubq_s16(q9s16, q10s16);
-  q3s16 = vsubq_s16(q8s16, q11s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q11s32 = vmull_s16(d26s16, d16s16);
-  q12s32 = vmull_s16(d27s16, d16s16);
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-
-  q6s32 = vsubq_s32(q9s32, q11s32);
-  q13s32 = vsubq_s32(q10s32, q12s32);
-  q9s32 = vaddq_s32(q9s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q12s32);
-
-  d10s16 = vqrshrn_n_s32(q6s32, 14);
-  d11s16 = vqrshrn_n_s32(q13s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q10s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 6
-  q8s16 = vaddq_s16(q0s16, q15s16);
-  q9s16 = vaddq_s16(q1s16, q6s16);
-  q10s16 = vaddq_s16(q2s16, q5s16);
-  q11s16 = vaddq_s16(q3s16, q4s16);
-  q12s16 = vsubq_s16(q3s16, q4s16);
-  q13s16 = vsubq_s16(q2s16, q5s16);
-  q14s16 = vsubq_s16(q1s16, q6s16);
-  q15s16 = vsubq_s16(q0s16, q15s16);
-
-  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
-  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
-  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
-  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  // store the data
-  output_stride >>= 1;  // output_stride / 2, out is int16_t
-  vst1_u64((uint64_t *)out, d16u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d17u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d20u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d21u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d22u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d23u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d24u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d31u64);
-  return;
-}
-
-void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
-                                      int16_t *pass1Output, int16_t skip_adding,
-                                      uint8_t *dest, int dest_stride) {
-  uint8_t *d;
-  uint8x8_t d12u8, d13u8;
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64;
-  int64x1_t d12s64, d13s64;
-  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
-  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = vld2q_s16(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-  d30s16 = vget_low_s16(q15s16);
-  d31s16 = vget_high_s16(q15s16);
-
-  // stage 3
-  d12s16 = vdup_n_s16((int16_t)cospi_30_64);
-  d13s16 = vdup_n_s16((int16_t)cospi_2_64);
-
-  q2s32 = vmull_s16(d16s16, d12s16);
-  q3s32 = vmull_s16(d17s16, d12s16);
-  q1s32 = vmull_s16(d16s16, d13s16);
-  q4s32 = vmull_s16(d17s16, d13s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
-  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
-  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
-  d0s16 = vqrshrn_n_s32(q2s32, 14);
-  d1s16 = vqrshrn_n_s32(q3s32, 14);
-  d14s16 = vqrshrn_n_s32(q1s32, 14);
-  d15s16 = vqrshrn_n_s32(q4s32, 14);
-  q0s16 = vcombine_s16(d0s16, d1s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_14_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_18_64);
-
-  q2s32 = vmull_s16(d24s16, d30s16);
-  q3s32 = vmull_s16(d25s16, d30s16);
-  q4s32 = vmull_s16(d24s16, d31s16);
-  q5s32 = vmull_s16(d25s16, d31s16);
-
-  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
-  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
-  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
-  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
-  d2s16 = vqrshrn_n_s32(q2s32, 14);
-  d3s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q4s32, 14);
-  d13s16 = vqrshrn_n_s32(q5s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_22_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_10_64);
-
-  q11s32 = vmull_s16(d20s16, d30s16);
-  q12s32 = vmull_s16(d21s16, d30s16);
-  q4s32 = vmull_s16(d20s16, d31s16);
-  q5s32 = vmull_s16(d21s16, d31s16);
-
-  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
-  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
-  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
-  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
-  d4s16 = vqrshrn_n_s32(q11s32, 14);
-  d5s16 = vqrshrn_n_s32(q12s32, 14);
-  d11s16 = vqrshrn_n_s32(q5s32, 14);
-  d10s16 = vqrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_6_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_26_64);
-
-  q10s32 = vmull_s16(d28s16, d30s16);
-  q11s32 = vmull_s16(d29s16, d30s16);
-  q12s32 = vmull_s16(d28s16, d31s16);
-  q13s32 = vmull_s16(d29s16, d31s16);
-
-  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
-  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
-  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
-  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
-  d6s16 = vqrshrn_n_s32(q10s32, 14);
-  d7s16 = vqrshrn_n_s32(q11s32, 14);
-  d8s16 = vqrshrn_n_s32(q12s32, 14);
-  d9s16 = vqrshrn_n_s32(q13s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 3
-  q9s16 = vsubq_s16(q0s16, q1s16);
-  q0s16 = vaddq_s16(q0s16, q1s16);
-  q10s16 = vsubq_s16(q3s16, q2s16);
-  q11s16 = vaddq_s16(q2s16, q3s16);
-  q12s16 = vaddq_s16(q4s16, q5s16);
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q6s16, q7s16);
-
-  // stage 4
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  q2s32 = vmull_s16(d18s16, d31s16);
-  q3s32 = vmull_s16(d19s16, d31s16);
-  q4s32 = vmull_s16(d28s16, d31s16);
-  q5s32 = vmull_s16(d29s16, d31s16);
-
-  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
-  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
-  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
-  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
-  d12s16 = vqrshrn_n_s32(q2s32, 14);
-  d13s16 = vqrshrn_n_s32(q3s32, 14);
-  d2s16 = vqrshrn_n_s32(q4s32, 14);
-  d3s16 = vqrshrn_n_s32(q5s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  q3s16 = q11s16;
-  q4s16 = q12s16;
-
-  d30s16 = vdup_n_s16(-cospi_8_64);
-  q11s32 = vmull_s16(d26s16, d30s16);
-  q12s32 = vmull_s16(d27s16, d30s16);
-  q8s32 = vmull_s16(d20s16, d30s16);
-  q9s32 = vmull_s16(d21s16, d30s16);
-
-  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
-  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
-  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
-  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
-  d4s16 = vqrshrn_n_s32(q11s32, 14);
-  d5s16 = vqrshrn_n_s32(q12s32, 14);
-  d10s16 = vqrshrn_n_s32(q8s32, 14);
-  d11s16 = vqrshrn_n_s32(q9s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  // stage 5
-  q8s16 = vaddq_s16(q0s16, q3s16);
-  q9s16 = vaddq_s16(q1s16, q2s16);
-  q10s16 = vsubq_s16(q1s16, q2s16);
-  q11s16 = vsubq_s16(q0s16, q3s16);
-  q12s16 = vsubq_s16(q7s16, q4s16);
-  q13s16 = vsubq_s16(q6s16, q5s16);
-  q14s16 = vaddq_s16(q6s16, q5s16);
-  q15s16 = vaddq_s16(q7s16, q4s16);
-
-  // stage 6
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q3s32 = vmull_s16(d26s16, d14s16);
-  q4s32 = vmull_s16(d27s16, d14s16);
-  q0s32 = vmull_s16(d20s16, d14s16);
-  q1s32 = vmull_s16(d21s16, d14s16);
-
-  q5s32 = vsubq_s32(q3s32, q0s32);
-  q6s32 = vsubq_s32(q4s32, q1s32);
-  q10s32 = vaddq_s32(q3s32, q0s32);
-  q4s32 = vaddq_s32(q4s32, q1s32);
-
-  d4s16 = vqrshrn_n_s32(q5s32, 14);
-  d5s16 = vqrshrn_n_s32(q6s32, 14);
-  d10s16 = vqrshrn_n_s32(q10s32, 14);
-  d11s16 = vqrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q0s32 = vmull_s16(d22s16, d14s16);
-  q1s32 = vmull_s16(d23s16, d14s16);
-  q13s32 = vmull_s16(d24s16, d14s16);
-  q6s32 = vmull_s16(d25s16, d14s16);
-
-  q10s32 = vsubq_s32(q13s32, q0s32);
-  q4s32 = vsubq_s32(q6s32, q1s32);
-  q13s32 = vaddq_s32(q13s32, q0s32);
-  q6s32 = vaddq_s32(q6s32, q1s32);
-
-  d6s16 = vqrshrn_n_s32(q10s32, 14);
-  d7s16 = vqrshrn_n_s32(q4s32, 14);
-  d8s16 = vqrshrn_n_s32(q13s32, 14);
-  d9s16 = vqrshrn_n_s32(q6s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 7
-  if (skip_adding != 0) {
-    d = dest;
-    // load the data in pass1
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += dest_stride;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    // store the data  out 8,9,10,11,12,13,14,15
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q8s16 = vrshrq_n_s16(q8s16, 6);
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q9s16 = vrshrq_n_s16(q9s16, 6);
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q2s16 = vrshrq_n_s16(q2s16, 6);
-    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q3s16 = vrshrq_n_s16(q3s16, 6);
-    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q4s16 = vrshrq_n_s16(q4s16, 6);
-    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q5s16 = vrshrq_n_s16(q5s16, 6);
-    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += dest_stride;
-    q14s16 = vrshrq_n_s16(q14s16, 6);
-    q14u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += dest_stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    q15s16 = vrshrq_n_s16(q15s16, 6);
-    q15u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-  } else {  // skip_adding_dest
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q1s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q11s16 = vld1q_s16(pass1Output);
-    pass1Output += 8;
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
-  }
-  return;
-}
-
-void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
-                                     int output_stride) {
-  int16x4_t d4s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q6s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q15s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = vld2q_s16(in);
-  q8s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q9s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q10s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q11s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q12s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q13s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q14s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  // stage 3
-  q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2));
-  q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2));
-
-  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
-  // stage 4
-  q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2));
-  d4s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
-  d8s16 = vget_low_s16(q4s16);
-  d9s16 = vget_high_s16(q4s16);
-  d14s16 = vget_low_s16(q7s16);
-  d15s16 = vget_high_s16(q7s16);
-  q9s32 = vmull_s16(d14s16, d4s16);
-  q10s32 = vmull_s16(d15s16, d4s16);
-  q12s32 = vmull_s16(d9s16, d4s16);
-  q11s32 = vmull_s16(d8s16, d4s16);
-
-  q15s32 = vsubq_s32(q10s32, q12s32);
-  q6s32 = vsubq_s32(q9s32, q11s32);
-  q9s32 = vaddq_s32(q9s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q12s32);
-
-  d11s16 = vqrshrn_n_s32(q15s32, 14);
-  d10s16 = vqrshrn_n_s32(q6s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q10s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 6
-  q2s16 = vaddq_s16(q8s16, q7s16);
-  q9s16 = vaddq_s16(q8s16, q6s16);
-  q10s16 = vaddq_s16(q8s16, q5s16);
-  q11s16 = vaddq_s16(q8s16, q4s16);
-  q12s16 = vsubq_s16(q8s16, q4s16);
-  q13s16 = vsubq_s16(q8s16, q5s16);
-  q14s16 = vsubq_s16(q8s16, q6s16);
-  q15s16 = vsubq_s16(q8s16, q7s16);
-
-  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
-  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
-  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
-  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
-  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  // store the data
-  output_stride >>= 1;  // output_stride / 2, out is int16_t
-  vst1_u64((uint64_t *)out, d4u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d5u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d20u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d21u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d22u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d23u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d24u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += output_stride;
-  vst1_u64((uint64_t *)out, d31u64);
-  return;
-}
-
-void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
-                                     int16_t *pass1Output, int16_t skip_adding,
-                                     uint8_t *dest, int dest_stride) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
-  uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
-  uint64x1_t d16u64, d17u64, d18u64, d19u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32;
-  int16x8x2_t q0x2s16;
-  (void)skip_adding;
-  (void)dest;
-  (void)dest_stride;
-
-  q0x2s16 = vld2q_s16(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q15s16 = q0x2s16.val[0];
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  // stage 3
-  q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2));
-  q0s16 = vqrdmulhq_s16(q8s16, q6s16);
-  q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2));
-  q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
-  q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
-  q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2));
-  q3s16 = vqrdmulhq_s16(q9s16, q15s16);
-  q4s16 = vqrdmulhq_s16(q9s16, q14s16);
-
-  // stage 4
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-  d6s16 = vget_low_s16(q3s16);
-  d7s16 = vget_high_s16(q3s16);
-  d8s16 = vget_low_s16(q4s16);
-  d9s16 = vget_high_s16(q4s16);
-  d14s16 = vget_low_s16(q7s16);
-  d15s16 = vget_high_s16(q7s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  q12s32 = vmull_s16(d14s16, d31s16);
-  q5s32 = vmull_s16(d15s16, d31s16);
-  q2s32 = vmull_s16(d0s16, d31s16);
-  q11s32 = vmull_s16(d1s16, d31s16);
-
-  q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
-  q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
-  q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
-  q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
-  d2s16 = vqrshrn_n_s32(q12s32, 14);
-  d3s16 = vqrshrn_n_s32(q5s32, 14);
-  d12s16 = vqrshrn_n_s32(q2s32, 14);
-  d13s16 = vqrshrn_n_s32(q11s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  d30s16 = vdup_n_s16(-cospi_8_64);
-  q10s32 = vmull_s16(d8s16, d30s16);
-  q13s32 = vmull_s16(d9s16, d30s16);
-  q8s32 = vmull_s16(d6s16, d30s16);
-  q9s32 = vmull_s16(d7s16, d30s16);
-
-  q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
-  q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
-  q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
-  q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
-  d4s16 = vqrshrn_n_s32(q10s32, 14);
-  d5s16 = vqrshrn_n_s32(q13s32, 14);
-  d10s16 = vqrshrn_n_s32(q8s32, 14);
-  d11s16 = vqrshrn_n_s32(q9s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  // stage 5
-  q8s16 = vaddq_s16(q0s16, q3s16);
-  q9s16 = vaddq_s16(q1s16, q2s16);
-  q10s16 = vsubq_s16(q1s16, q2s16);
-  q11s16 = vsubq_s16(q0s16, q3s16);
-  q12s16 = vsubq_s16(q7s16, q4s16);
-  q13s16 = vsubq_s16(q6s16, q5s16);
-  q14s16 = vaddq_s16(q6s16, q5s16);
-  q15s16 = vaddq_s16(q7s16, q4s16);
-
-  // stage 6
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-  q3s32 = vmull_s16(d26s16, d14s16);
-  q4s32 = vmull_s16(d27s16, d14s16);
-  q0s32 = vmull_s16(d20s16, d14s16);
-  q1s32 = vmull_s16(d21s16, d14s16);
-
-  q5s32 = vsubq_s32(q3s32, q0s32);
-  q6s32 = vsubq_s32(q4s32, q1s32);
-  q0s32 = vaddq_s32(q3s32, q0s32);
-  q4s32 = vaddq_s32(q4s32, q1s32);
-
-  d4s16 = vqrshrn_n_s32(q5s32, 14);
-  d5s16 = vqrshrn_n_s32(q6s32, 14);
-  d10s16 = vqrshrn_n_s32(q0s32, 14);
-  d11s16 = vqrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q0s32 = vmull_s16(d22s16, d14s16);
-  q1s32 = vmull_s16(d23s16, d14s16);
-  q13s32 = vmull_s16(d24s16, d14s16);
-  q6s32 = vmull_s16(d25s16, d14s16);
-
-  q10s32 = vsubq_s32(q13s32, q0s32);
-  q4s32 = vsubq_s32(q6s32, q1s32);
-  q13s32 = vaddq_s32(q13s32, q0s32);
-  q6s32 = vaddq_s32(q6s32, q1s32);
-
-  d6s16 = vqrshrn_n_s32(q10s32, 14);
-  d7s16 = vqrshrn_n_s32(q4s32, 14);
-  d8s16 = vqrshrn_n_s32(q13s32, 14);
-  d9s16 = vqrshrn_n_s32(q6s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 7
-  q0s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q1s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q12s16 = vaddq_s16(q0s16, q15s16);
-  q13s16 = vaddq_s16(q1s16, q14s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q14s16 = vsubq_s16(q1s16, q14s16);
-  q15s16 = vsubq_s16(q0s16, q15s16);
-
-  q10s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q11s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q12s16 = vaddq_s16(q10s16, q5s16);
-  q13s16 = vaddq_s16(q11s16, q4s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q4s16 = vsubq_s16(q11s16, q4s16);
-  q5s16 = vsubq_s16(q10s16, q5s16);
-
-  q0s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q1s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q12s16 = vaddq_s16(q0s16, q3s16);
-  q13s16 = vaddq_s16(q1s16, q2s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q2s16 = vsubq_s16(q1s16, q2s16);
-  q3s16 = vsubq_s16(q0s16, q3s16);
-
-  q10s16 = vld1q_s16(pass1Output);
-  pass1Output += 8;
-  q11s16 = vld1q_s16(pass1Output);
-  q12s16 = vaddq_s16(q10s16, q9s16);
-  q13s16 = vaddq_s16(q11s16, q8s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q8s16 = vsubq_s16(q11s16, q8s16);
-  q9s16 = vsubq_s16(q10s16, q9s16);
-
-  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
-  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
-  d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
-  d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
-  d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
-  d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
-  d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
-  d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
-  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  vst1_u64((uint64_t *)out, d16u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d17u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d4u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d5u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d6u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d7u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d8u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d9u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d10u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d11u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d31u64);
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm
deleted file mode 100644
index 4a8f8f183..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm
+++ /dev/null
@@ -1,1182 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_idct16x16_256_add_neon_pass1|
-    EXPORT  |aom_idct16x16_256_add_neon_pass2|
-    EXPORT  |aom_idct16x16_10_add_neon_pass1|
-    EXPORT  |aom_idct16x16_10_add_neon_pass2|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Transpose an 8x8 matrix of 16-bit data. The data is loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input,
-;                                          int16_t *output, int output_stride)
-;
-; r0  int16_t *input
-; r1  int16_t *output
-; r2  int  output_stride
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_256_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0xc00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r12, #0x3e00
-    add             r12, #0xc5
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r12                   ; duplicate cospi_4_64
-
-    ; preloading to avoid stall
-    ; generate cospi_12_64 = 13623
-    mov             r3, #0x3500
-    add             r3, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r12, #0x2300
-    add             r12, #0x8e
-
-    ; step2[4] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; step2[4] * cospi_4_64
-    vmull.s16       q5, d18, d1
-    vmull.s16       q6, d19, d1
-
-    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
-    vmlal.s16       q5, d30, d0
-    vmlal.s16       q6, d31, d0
-
-    vdup.16         d2, r3                    ; duplicate cospi_12_64
-    vdup.16         d3, r12                   ; duplicate cospi_20_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d14, q5, #14              ; >> 14
-    vqrshrn.s32     d15, q6, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; step2[5] * cospi_12_64
-    vmull.s16       q2, d26, d2
-    vmull.s16       q3, d27, d2
-
-    ; step2[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q15, d27, d3
-
-    ; temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64
-    vmlsl.s16       q2, d22, d3
-    vmlsl.s16       q3, d23, d3
-
-    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q15, d23, d2
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q2, #14              ; >> 14
-    vqrshrn.s32     d11, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q15, #14             ; >> 14
-
-    ; stage 4
-    vdup.16         d30, r3                   ; cospi_16_64
-
-    ; step1[0] * cospi_16_64
-    vmull.s16       q2, d16, d30
-    vmull.s16       q11, d17, d30
-
-    ; step1[1] * cospi_16_64
-    vmull.s16       q0, d24, d30
-    vmull.s16       q1, d25, d30
-
-    ; generate cospi_8_64 = 15137
-    mov             r3, #0x3b00
-    add             r3, #0x21
-
-    vdup.16         d30, r12                  ; duplicate cospi_24_64
-    vdup.16         d31, r3                   ; duplicate cospi_8_64
-
-    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
-    vadd.s32        q3, q2, q0
-    vadd.s32        q12, q11, q1
-
-    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
-    vsub.s32        q13, q2, q0
-    vsub.s32        q1, q11, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d16, q3, #14              ; >> 14
-    vqrshrn.s32     d17, q12, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d18, q13, #14             ; >> 14
-    vqrshrn.s32     d19, q1, #14              ; >> 14
-
-    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-    ; step1[2] * cospi_8_64
-    vmull.s16       q0, d20, d31
-    vmull.s16       q1, d21, d31
-
-    ; step1[2] * cospi_24_64
-    vmull.s16       q12, d20, d30
-    vmull.s16       q13, d21, d30
-
-    ; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64
-    vmlal.s16       q0, d28, d30
-    vmlal.s16       q1, d29, d30
-
-    ; temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64
-    vmlsl.s16       q12, d28, d31
-    vmlsl.s16       q13, d29, d31
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d22, q0, #14              ; >> 14
-    vqrshrn.s32     d23, q1, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d20, q12, #14             ; >> 14
-    vqrshrn.s32     d21, q13, #14             ; >> 14
-
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
-    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
-
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    ; stage 5
-    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
-    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
-    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
-    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
-
-    vdup.16         d16, r3;                  ; duplicate cospi_16_64
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q11, d26, d16
-    vmull.s16       q12, d27, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q6, q9, q11
-    vsub.s32        q13, q10, q12
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q6, #14              ; >> 14
-    vqrshrn.s32     d11, q13, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q8, q0, q15                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {d16}, [r1], r2
-    vst1.64         {d17}, [r1], r2
-    vst1.64         {d18}, [r1], r2
-    vst1.64         {d19}, [r1], r2
-    vst1.64         {d20}, [r1], r2
-    vst1.64         {d21}, [r1], r2
-    vst1.64         {d22}, [r1], r2
-    vst1.64         {d23}, [r1], r2
-    vst1.64         {d24}, [r1], r2
-    vst1.64         {d25}, [r1], r2
-    vst1.64         {d26}, [r1], r2
-    vst1.64         {d27}, [r1], r2
-    vst1.64         {d28}, [r1], r2
-    vst1.64         {d29}, [r1], r2
-    vst1.64         {d30}, [r1], r2
-    vst1.64         {d31}, [r1], r2
-
-    bx              lr
-    ENDP  ; |aom_idct16x16_256_add_neon_pass1|
-
-;void aom_idct16x16_256_add_neon_pass2(int16_t *src,
-;                                        int16_t *output,
-;                                        int16_t *pass1Output,
-;                                        int16_t skip_adding,
-;                                        uint8_t *dest,
-;                                        int dest_stride)
-;
-; r0  int16_t *src
-; r1  int16_t *output,
-; r2  int16_t *pass1Output,
-; r3  int16_t skip_adding,
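-; [sp]      uint8_t *dest (stack argument; read at [sp, #28] after the push)
-; [sp, #4]  int dest_stride (stack argument; read at [sp, #32] after the push)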
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_256_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0;
-
-    ; generate  cospi_30_64 = 1606
-    mov             r3, #0x0600
-    add             r3, #0x46
-
-    ; generate cospi_2_64  = 16305
-    mov             r12, #0x3f00
-    add             r12, #0xb1
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d12, r3                   ; duplicate cospi_30_64
-    vdup.16         d13, r12                  ; duplicate cospi_2_64
-
-    ; preloading to avoid stall
-    ; generate cospi_14_64 = 12665
-    mov             r3, #0x3100
-    add             r3, #0x79
-
-    ; generate cospi_18_64 = 10394
-    mov             r12, #0x2800
-    add             r12, #0x9a
-
-    ; step1[8] * cospi_30_64
-    vmull.s16       q2, d16, d12
-    vmull.s16       q3, d17, d12
-
-    ; step1[8] * cospi_2_64
-    vmull.s16       q1, d16, d13
-    vmull.s16       q4, d17, d13
-
-    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
-    vmlsl.s16       q2, d30, d13
-    vmlsl.s16       q3, d31, d13
-
-    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
-    vmlal.s16       q1, d30, d12
-    vmlal.s16       q4, d31, d12
-
-    vdup.16         d30, r3                   ; duplicate cospi_14_64
-    vdup.16         d31, r12                  ; duplicate cospi_18_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d0, q2, #14               ; >> 14
-    vqrshrn.s32     d1, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d14, q1, #14              ; >> 14
-    vqrshrn.s32     d15, q4, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_10_64 = 14449
-    mov             r12, #0x3800
-    add             r12, #0x71
-
-    ; step1[9] * cospi_14_64
-    vmull.s16       q2, d24, d30
-    vmull.s16       q3, d25, d30
-
-    ; step1[9] * cospi_18_64
-    vmull.s16       q4, d24, d31
-    vmull.s16       q5, d25, d31
-
-    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
-    vmlsl.s16       q2, d22, d31
-    vmlsl.s16       q3, d23, d31
-
-    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
-    vmlal.s16       q4, d22, d30
-    vmlal.s16       q5, d23, d30
-
-    vdup.16         d30, r3                   ; duplicate cospi_22_64
-    vdup.16         d31, r12                  ; duplicate cospi_10_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q2, #14               ; >> 14
-    vqrshrn.s32     d3, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q4, #14              ; >> 14
-    vqrshrn.s32     d13, q5, #14              ; >> 14
-
-    ; step1[10] * cospi_22_64
-    vmull.s16       q11, d20, d30
-    vmull.s16       q12, d21, d30
-
-    ; step1[10] * cospi_10_64
-    vmull.s16       q4, d20, d31
-    vmull.s16       q5, d21, d31
-
-    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
-    vmlsl.s16       q11, d26, d31
-    vmlsl.s16       q12, d27, d31
-
-    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
-    vmlal.s16       q4, d26, d30
-    vmlal.s16       q5, d27, d30
-
-    ; preloading to avoid stall
-    ; generate cospi_6_64 = 15679
-    mov             r3, #0x3d00
-    add             r3, #0x3f
-
-    ; generate cospi_26_64 = 4756
-    mov             r12, #0x1200
-    add             r12, #0x94
-
-    vdup.16         d30, r3                   ; duplicate cospi_6_64
-    vdup.16         d31, r12                  ; duplicate cospi_26_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q11, #14              ; >> 14
-    vqrshrn.s32     d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d11, q5, #14              ; >> 14
-    vqrshrn.s32     d10, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_6_64
-    vmull.s16       q10, d28, d30
-    vmull.s16       q11, d29, d30
-
-    ; step1[11] * cospi_26_64
-    vmull.s16       q12, d28, d31
-    vmull.s16       q13, d29, d31
-
-    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
-    vmlsl.s16       q10, d18, d31
-    vmlsl.s16       q11, d19, d31
-
-    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
-    vmlal.s16       q12, d18, d30
-    vmlal.s16       q13, d19, d30
-
-    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
-    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q11, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d8, q12, #14              ; >> 14
-    vqrshrn.s32     d9, q13, #14              ; >> 14
-
-    ; stage 3
-    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
-    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
-    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
-    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
-    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+step2[15]
-    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
-
-    ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
-
-    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d18, d31
-    vmull.s16       q3, d19, d31
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q4, d28, d31
-    vmull.s16       q5, d29, d31
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d28, d30
-    vmlal.s16       q3, d29, d30
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q4, d18, d30
-    vmlsl.s16       q5, d19, d30
-
-    rsb             r12, #0
-    vdup.16         d30, r12                  ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q4, #14               ; >> 14
-    vqrshrn.s32     d3, q5, #14               ; >> 14
-
-    vmov.s16        q3, q11
-    vmov.s16        q4, q12
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q11, d26, d30
-    vmull.s16       q12, d27, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d20, d30
-    vmull.s16       q9, d21, d30
-
-    ; temp2 = -step1[13] * cospi_8_64 - step1[10] * cospi_24_64
-    vmlsl.s16       q11, d20, d31
-    vmlsl.s16       q12, d21, d31
-
-    ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d26, d31
-    vmlal.s16       q9, d27, d31
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d4, q11, #14              ; >> 14
-    vqrshrn.s32     d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d10, q8, #14              ; >> 14
-    vqrshrn.s32     d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q10, q3, q0
-    vadd.s32        q4, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q5, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q10, #14             ; >> 14
-    vqrshrn.s32     d11, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d8, q13, #14              ; >> 14
-    vqrshrn.s32     d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1Output stride
-    ldr              r3, [sp]                 ; load skip_adding
-    cmp              r3, #0                   ; check if need adding dest data
-    beq              skip_adding_dest
-
-    ldr              r7, [sp, #28]            ; dest used to save element 0-7
-    mov              r9, r7                   ; save dest pointer for later use
-    ldr              r8, [sp, #32]            ; load dest_stride
-
-    ; stage 7
-    ; load the data in pass1
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * dest_stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
-    vaddw.u8        q8, q8, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q8                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q9, q9, #6
-    vaddw.u8        q9, q9, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q9                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vrshr.s16       q2, q2, #6
-    vaddw.u8        q2, q2, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q2                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q3, q3, #6
-    vaddw.u8        q3, q3, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q3                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vrshr.s16       q4, q4, #6
-    vaddw.u8        q4, q4, d12               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q4                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data
-    vrshr.s16       q5, q5, #6
-    vaddw.u8        q5, q5, d13               ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q5                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destination data
-    vrshr.s16       q14, q14, #6
-    vaddw.u8        q14, q14, d12             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d12, q14                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destination data (17th load via r7; result unused)
-    vrshr.s16       q15, q15, #6
-    vaddw.u8        q15, q15, d13             ; + dest[j * dest_stride + i]
-    vqmovun.s16     d13, q15                  ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    b               end_idct16x16_pass2
-
-skip_adding_dest
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |aom_idct16x16_256_add_neon_pass2|
-
-;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input,
-;                                             int16_t *output, int output_stride)
-;
-; r0  int16_t *input
-; r1  int16_t *output
-; r2  int  output_stride
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_10_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-    ; generate  cospi_28_64*2 = 6392
-    mov             r3, #0x1800
-    add             r3, #0xf8
-
-    ; generate cospi_4_64*2  = 32138
-    mov             r12, #0x7d00
-    add             r12, #0x8a
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply,
-    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
-    ; the constant will change this to >> 14.
-    ; dct_const_round_shift(step2[4] * cospi_28_64);
-    vqrdmulh.s16    q4, q9, q0
-
-    ; preloading to avoid stall
-    ; generate cospi_16_64*2 = 23170
-    mov             r3, #0x5a00
-    add             r3, #0x82
-
-    ; dct_const_round_shift(step2[4] * cospi_4_64);
-    vqrdmulh.s16    q7, q9, q1
-
-    ; stage 4
-    vdup.16         q1, r3                    ; cospi_16_64*2
-
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
-
-    vdup.16         d4, r3;                   ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(step1[0] * cospi_16_64)
-    vqrdmulh.s16    q8, q8, q1
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d14, d4
-    vmull.s16       q10, d15, d4
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q12, d9, d4
-    vmull.s16       q11, d8, d4
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q15, q10, q12
-    vsub.s32        q6, q9, q11
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d11, q15, #14             ; >> 14
-    vqrshrn.s32     d10, q6, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
-    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
-    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {d4}, [r1], r2
-    vst1.64         {d5}, [r1], r2
-    vst1.64         {d18}, [r1], r2
-    vst1.64         {d19}, [r1], r2
-    vst1.64         {d20}, [r1], r2
-    vst1.64         {d21}, [r1], r2
-    vst1.64         {d22}, [r1], r2
-    vst1.64         {d23}, [r1], r2
-    vst1.64         {d24}, [r1], r2
-    vst1.64         {d25}, [r1], r2
-    vst1.64         {d26}, [r1], r2
-    vst1.64         {d27}, [r1], r2
-    vst1.64         {d28}, [r1], r2
-    vst1.64         {d29}, [r1], r2
-    vst1.64         {d30}, [r1], r2
-    vst1.64         {d31}, [r1], r2
-
-    bx              lr
-    ENDP  ; |aom_idct16x16_10_add_neon_pass1|
-
-;void aom_idct16x16_10_add_neon_pass2(int16_t *src,
-;                                           int16_t *output,
-;                                           int16_t *pass1Output,
-;                                           int16_t skip_adding,
-;                                           uint8_t *dest,
-;                                           int dest_stride)
-;
-; r0  int16_t *src
-; r1  int16_t *output,
-; r2  int16_t *pass1Output,
-; r3  int16_t skip_adding,
-; r4  uint8_t *dest,
-; r5  int dest_stride)
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|aom_idct16x16_10_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0;
-
-    ; generate 2*cospi_30_64 = 3212
-    mov             r3, #0xc00
-    add             r3, #0x8c
-
-    ; generate 2*cospi_2_64  = 32610
-    mov             r12, #0x7f00
-    add             r12, #0x62
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
-
-    ; dct_const_round_shift(step1[8] * cospi_30_64)
-    vqrdmulh.s16    q0, q8, q6
-
-    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
-
-    ; dct_const_round_shift(step1[8] * cospi_2_64)
-    vqrdmulh.s16    q7, q8, q6
-
-    ; preloading to avoid stall
-    ; generate 2*cospi_26_64 = 9512
-    mov             r12, #0x2500
-    add             r12, #0x28
-    rsb             r12, #0
-    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
-
-    ; generate 2*cospi_6_64 = 31358
-    mov             r3, #0x7a00
-    add             r3, #0x7e
-    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
-
-    ; dct_const_round_shift(- step1[12] * cospi_26_64)
-    vqrdmulh.s16    q3, q9, q15
-
-    ; dct_const_round_shift(step1[12] * cospi_6_64)
-    vqrdmulh.s16    q4, q9, q14
-
-    ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q12, d14, d31
-    vmull.s16       q5, d15, d31
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d0, d31
-    vmull.s16       q11, d1, d31
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q12, d0, d30
-    vmlsl.s16       q5, d1, d30
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d14, d30
-    vmlal.s16       q11, d15, d30
-
-    rsb              r12, #0
-    vdup.16          d30, r12                 ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d2, q12, #14              ; >> 14
-    vqrshrn.s32     d3, q5, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d12, q2, #14              ; >> 14
-    vqrshrn.s32     d13, q11, #14             ; >> 14
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q10, d8, d30
-    vmull.s16       q13, d9, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d6, d30
-    vmull.s16       q9, d7, d30
-
-    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
-    vmlsl.s16       q10, d6, d31
-    vmlsl.s16       q13, d7, d31
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d8, d31
-    vmlal.s16       q9, d9, d31
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q10, #14              ; >> 14
-    vqrshrn.s32     d5, q13, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q8, #14              ; >> 14
-    vqrshrn.s32     d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q0, q3, q0
-    vadd.s32        q1, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vqrshrn.s32     d4, q5, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vqrshrn.s32     d10, q0, #14              ; >> 14
-    vqrshrn.s32     d11, q1, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift((-step1[11] + step1[12]) * cospi_16_64);
-    vqrshrn.s32     d6, q10, #14              ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
-    vqrshrn.s32     d8, q13, #14              ; >> 14
-    vqrshrn.s32     d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1Output stride
-    ldr              r3, [sp]                 ; load skip_adding
-
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct10_16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |aom_idct16x16_10_add_neon_pass2|
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c
deleted file mode 100644
index db0d4905b..000000000
--- a/third_party/aom/aom_dsp/arm/idct16x16_neon.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_dsp_common.h"
-
-void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
-                                      int output_stride);
-void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
-                                      int16_t *pass1Output, int16_t skip_adding,
-                                      uint8_t *dest, int dest_stride);
-void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
-                                     int output_stride);
-void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
-                                     int16_t *pass1Output, int16_t skip_adding,
-                                     uint8_t *dest, int dest_stride);
-
-#if HAVE_NEON_ASM
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void aom_push_neon(int64_t *store);
-extern void aom_pop_neon(int64_t *store);
-#endif  // HAVE_NEON_ASM
-
-void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
-                                int dest_stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16 * 16] = { 0 };
-  int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  aom_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
-                                   dest, dest_stride);
-
-  /* Parallel idct on the lower 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
-                                   pass1_output, 0, dest, dest_stride);
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
-                                   pass1_output, 1, dest, dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                   row_idct_output + 8, pass1_output, 1,
-                                   dest + 8, dest_stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  aom_pop_neon(store_reg);
-#endif
-
-  return;
-}
-
-void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
-                               int dest_stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16 * 16] = { 0 };
-  int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  aom_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
-                                  dest, dest_stride);
-
-  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
-                                   pass1_output, 1, dest, dest_stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                   row_idct_output + 8, pass1_output, 1,
-                                   dest + 8, dest_stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  aom_pop_neon(store_reg);
-#endif
-
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c
deleted file mode 100644
index 547567c5b..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
-                           uint8x16_t *q9u8, uint8x16_t *q10u8,
-                           uint8x16_t *q11u8, uint8x16_t *q12u8,
-                           uint8x16_t *q13u8, uint8x16_t *q14u8,
-                           uint8x16_t *q15u8) {
-  *q8u8 = vld1q_u8(d);
-  d += d_stride;
-  *q9u8 = vld1q_u8(d);
-  d += d_stride;
-  *q10u8 = vld1q_u8(d);
-  d += d_stride;
-  *q11u8 = vld1q_u8(d);
-  d += d_stride;
-  *q12u8 = vld1q_u8(d);
-  d += d_stride;
-  *q13u8 = vld1q_u8(d);
-  d += d_stride;
-  *q14u8 = vld1q_u8(d);
-  d += d_stride;
-  *q15u8 = vld1q_u8(d);
-  return;
-}
-
-static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
-                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
-                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
-                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
-                                 uint8x16_t *q15u8) {
-  *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
-  *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
-  *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
-  *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
-  *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
-  *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
-  *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
-  *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
-  return;
-}
-
-static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
-                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
-                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
-                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
-                                 uint8x16_t *q15u8) {
-  *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
-  *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
-  *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
-  *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
-  *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
-  *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
-  *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
-  *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
-  return;
-}
-
-static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
-                           uint8x16_t *q9u8, uint8x16_t *q10u8,
-                           uint8x16_t *q11u8, uint8x16_t *q12u8,
-                           uint8x16_t *q13u8, uint8x16_t *q14u8,
-                           uint8x16_t *q15u8) {
-  vst1q_u8(d, *q8u8);
-  d += d_stride;
-  vst1q_u8(d, *q9u8);
-  d += d_stride;
-  vst1q_u8(d, *q10u8);
-  d += d_stride;
-  vst1q_u8(d, *q11u8);
-  d += d_stride;
-  vst1q_u8(d, *q12u8);
-  d += d_stride;
-  vst1q_u8(d, *q13u8);
-  d += d_stride;
-  vst1q_u8(d, *q14u8);
-  d += d_stride;
-  vst1q_u8(d, *q15u8);
-  return;
-}
-
-void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-  int i, j, dest_stride8;
-  uint8_t *d;
-  int16_t a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  dest_stride8 = dest_stride * 8;
-  if (a1 >= 0) {  // diff_positive_32_32
-    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
-    q0u8 = vdupq_n_u8(a1);
-    for (i = 0; i < 2; i++, dest += 16) {  // diff_positive_32_32_loop
-      d = dest;
-      for (j = 0; j < 4; j++) {
-        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                      &q14u8, &q15u8);
-        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        d += dest_stride8;
-      }
-    }
-  } else {  // diff_negative_32_32
-    a1 = -a1;
-    a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
-    q0u8 = vdupq_n_u8(a1);
-    for (i = 0; i < 2; i++, dest += 16) {  // diff_negative_32_32_loop
-      d = dest;
-      for (j = 0; j < 4; j++) {
-        LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                      &q14u8, &q15u8);
-        ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8,
-                &q14u8, &q15u8);
-        d += dest_stride8;
-      }
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm
deleted file mode 100644
index b04df2d0b..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm
+++ /dev/null
@@ -1,147 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-    EXPORT  |aom_idct32x32_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ;TODO(hkuang): put the following macros in a separate
-    ;file so other idct functions can also use them.
-    MACRO
-    LD_16x8          $src, $stride
-    vld1.8           {q8}, [$src], $stride
-    vld1.8           {q9}, [$src], $stride
-    vld1.8           {q10}, [$src], $stride
-    vld1.8           {q11}, [$src], $stride
-    vld1.8           {q12}, [$src], $stride
-    vld1.8           {q13}, [$src], $stride
-    vld1.8           {q14}, [$src], $stride
-    vld1.8           {q15}, [$src], $stride
-    MEND
-
-    MACRO
-    ADD_DIFF_16x8    $diff
-    vqadd.u8         q8, q8, $diff
-    vqadd.u8         q9, q9, $diff
-    vqadd.u8         q10, q10, $diff
-    vqadd.u8         q11, q11, $diff
-    vqadd.u8         q12, q12, $diff
-    vqadd.u8         q13, q13, $diff
-    vqadd.u8         q14, q14, $diff
-    vqadd.u8         q15, q15, $diff
-    MEND
-
-    MACRO
-    SUB_DIFF_16x8    $diff
-    vqsub.u8         q8, q8, $diff
-    vqsub.u8         q9, q9, $diff
-    vqsub.u8         q10, q10, $diff
-    vqsub.u8         q11, q11, $diff
-    vqsub.u8         q12, q12, $diff
-    vqsub.u8         q13, q13, $diff
-    vqsub.u8         q14, q14, $diff
-    vqsub.u8         q15, q15, $diff
-    MEND
-
-    MACRO
-    ST_16x8          $dst, $stride
-    vst1.8           {q8}, [$dst], $stride
-    vst1.8           {q9}, [$dst], $stride
-    vst1.8           {q10},[$dst], $stride
-    vst1.8           {q11},[$dst], $stride
-    vst1.8           {q12},[$dst], $stride
-    vst1.8           {q13},[$dst], $stride
-    vst1.8           {q14},[$dst], $stride
-    vst1.8           {q15},[$dst], $stride
-    MEND
-
-;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
-;                              int dest_stride)
-;
-; r0  int16_t *input
-; r1  uint8_t *dest
-; r2  int dest_stride
-
-|aom_idct32x32_1_add_neon| PROC
-    push             {lr}
-    pld              [r1]
-    add              r3, r1, #16               ; r3 dest + 16 for second loop
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asrs             r0, r0, #6                ; >> 6
-    bge              diff_positive_32_32
-
-diff_negative_32_32
-    neg              r0, r0
-    usat             r0, #8, r0
-    vdup.u8          q0, r0
-    mov              r0, #4
-
-diff_negative_32_32_loop
-    sub              r0, #1
-    LD_16x8          r1, r2
-    SUB_DIFF_16x8    q0
-    ST_16x8          r12, r2
-
-    LD_16x8          r1, r2
-    SUB_DIFF_16x8    q0
-    ST_16x8          r12, r2
-    cmp              r0, #2
-    moveq            r1, r3
-    moveq            r12, r3
-    cmp              r0, #0
-    bne              diff_negative_32_32_loop
-    pop              {pc}
-
-diff_positive_32_32
-    usat             r0, #8, r0
-    vdup.u8          q0, r0
-    mov              r0, #4
-
-diff_positive_32_32_loop
-    sub              r0, #1
-    LD_16x8          r1, r2
-    ADD_DIFF_16x8    q0
-    ST_16x8          r12, r2
-
-    LD_16x8          r1, r2
-    ADD_DIFF_16x8    q0
-    ST_16x8          r12, r2
-    cmp              r0, #2
-    moveq            r1, r3
-    moveq            r12, r3
-    cmp              r0, #0
-    bne              diff_positive_32_32_loop
-    pop              {pc}
-
-    ENDP             ; |aom_idct32x32_1_add_neon|
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c
deleted file mode 100644
index a7562c7d5..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c
+++ /dev/null
@@ -1,686 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-
-#define LOAD_FROM_TRANSPOSED(prev, first, second) \
-  q14s16 = vld1q_s16(trans_buf + first * 8);      \
-  q13s16 = vld1q_s16(trans_buf + second * 8);
-
-#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
-  qA = vld1q_s16(out + first * 32);                   \
-  qB = vld1q_s16(out + second * 32);
-
-#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
-  vst1q_s16(out + first * 32, qA);                   \
-  vst1q_s16(out + second * 32, qB);
-
-#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
-  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
-                                                  int stride, int16x8_t q6s16,
-                                                  int16x8_t q7s16,
-                                                  int16x8_t q8s16,
-                                                  int16x8_t q9s16) {
-  int16x4_t d8s16, d9s16, d10s16, d11s16;
-
-  d8s16 = vld1_s16((int16_t *)p1);
-  p1 += stride;
-  d11s16 = vld1_s16((int16_t *)p2);
-  p2 -= stride;
-  d9s16 = vld1_s16((int16_t *)p1);
-  d10s16 = vld1_s16((int16_t *)p2);
-
-  q7s16 = vrshrq_n_s16(q7s16, 6);
-  q8s16 = vrshrq_n_s16(q8s16, 6);
-  q9s16 = vrshrq_n_s16(q9s16, 6);
-  q6s16 = vrshrq_n_s16(q6s16, 6);
-
-  q7s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
-  q8s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
-  q9s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
-  q6s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
-
-  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
-  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
-  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-
-  vst1_s16((int16_t *)p1, d9s16);
-  p1 -= stride;
-  vst1_s16((int16_t *)p2, d10s16);
-  p2 += stride;
-  vst1_s16((int16_t *)p1, d8s16);
-  vst1_s16((int16_t *)p2, d11s16);
-  return;
-}
-
-#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
-  ;                                           \
-  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
-                                                   int stride, int16x8_t q4s16,
-                                                   int16x8_t q5s16,
-                                                   int16x8_t q6s16,
-                                                   int16x8_t q7s16) {
-  int16x4_t d4s16, d5s16, d6s16, d7s16;
-
-  d4s16 = vld1_s16((int16_t *)p1);
-  p1 += stride;
-  d7s16 = vld1_s16((int16_t *)p2);
-  p2 -= stride;
-  d5s16 = vld1_s16((int16_t *)p1);
-  d6s16 = vld1_s16((int16_t *)p2);
-
-  q5s16 = vrshrq_n_s16(q5s16, 6);
-  q6s16 = vrshrq_n_s16(q6s16, 6);
-  q7s16 = vrshrq_n_s16(q7s16, 6);
-  q4s16 = vrshrq_n_s16(q4s16, 6);
-
-  q5s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
-  q6s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
-  q7s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
-  q4s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
-
-  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
-  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
-
-  vst1_s16((int16_t *)p1, d5s16);
-  p1 -= stride;
-  vst1_s16((int16_t *)p2, d6s16);
-  p2 += stride;
-  vst1_s16((int16_t *)p2, d7s16);
-  vst1_s16((int16_t *)p1, d4s16);
-  return;
-}
-
-#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
-  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
-                                int16_t first_const, int16_t second_const,
-                                int16x8_t *qAs16, int16x8_t *qBs16) {
-  int16x4_t d30s16, d31s16;
-  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
-  int16x4_t dCs16, dDs16, dAs16, dBs16;
-
-  dCs16 = vget_low_s16(q14s16);
-  dDs16 = vget_high_s16(q14s16);
-  dAs16 = vget_low_s16(q13s16);
-  dBs16 = vget_high_s16(q13s16);
-
-  d30s16 = vdup_n_s16(first_const);
-  d31s16 = vdup_n_s16(second_const);
-
-  q8s32 = vmull_s16(dCs16, d30s16);
-  q10s32 = vmull_s16(dAs16, d31s16);
-  q9s32 = vmull_s16(dDs16, d30s16);
-  q11s32 = vmull_s16(dBs16, d31s16);
-  q12s32 = vmull_s16(dCs16, d31s16);
-
-  q8s32 = vsubq_s32(q8s32, q10s32);
-  q9s32 = vsubq_s32(q9s32, q11s32);
-
-  q10s32 = vmull_s16(dDs16, d31s16);
-  q11s32 = vmull_s16(dAs16, d30s16);
-  q15s32 = vmull_s16(dBs16, d30s16);
-
-  q11s32 = vaddq_s32(q12s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q15s32);
-
-  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
-  return;
-}
-
-static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
-  int16_t *in;
-  int i;
-  const int stride = 32;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  for (i = 0; i < 4; i++, input += 8) {
-    in = input;
-    q8s16 = vld1q_s16(in);
-    in += stride;
-    q9s16 = vld1q_s16(in);
-    in += stride;
-    q10s16 = vld1q_s16(in);
-    in += stride;
-    q11s16 = vld1q_s16(in);
-    in += stride;
-    q12s16 = vld1q_s16(in);
-    in += stride;
-    q13s16 = vld1q_s16(in);
-    in += stride;
-    q14s16 = vld1q_s16(in);
-    in += stride;
-    q15s16 = vld1q_s16(in);
-
-    d16s16 = vget_low_s16(q8s16);
-    d17s16 = vget_high_s16(q8s16);
-    d18s16 = vget_low_s16(q9s16);
-    d19s16 = vget_high_s16(q9s16);
-    d20s16 = vget_low_s16(q10s16);
-    d21s16 = vget_high_s16(q10s16);
-    d22s16 = vget_low_s16(q11s16);
-    d23s16 = vget_high_s16(q11s16);
-    d24s16 = vget_low_s16(q12s16);
-    d25s16 = vget_high_s16(q12s16);
-    d26s16 = vget_low_s16(q13s16);
-    d27s16 = vget_high_s16(q13s16);
-    d28s16 = vget_low_s16(q14s16);
-    d29s16 = vget_high_s16(q14s16);
-    d30s16 = vget_low_s16(q15s16);
-    d31s16 = vget_high_s16(q15s16);
-
-    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-    q12s16 = vcombine_s16(d17s16, d25s16);
-    q13s16 = vcombine_s16(d19s16, d27s16);
-    q14s16 = vcombine_s16(d21s16, d29s16);
-    q15s16 = vcombine_s16(d23s16, d31s16);
-
-    q0x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16));
-    q1x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16));
-    q2x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16));
-    q3x2s32 =
-        vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16));
-
-    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-    vst1q_s16(t_buf, q0x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q0x2s16.val[1]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q1x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q1x2s16.val[1]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q2x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q2x2s16.val[1]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q3x2s16.val[0]);
-    t_buf += 8;
-    vst1q_s16(t_buf, q3x2s16.val[1]);
-    t_buf += 8;
-  }
-  return;
-}
-
-static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
-                                             int16x8_t q3s16, int16x8_t q6s16,
-                                             int16x8_t q7s16, int16x8_t q8s16,
-                                             int16x8_t q9s16, int16x8_t q10s16,
-                                             int16x8_t q11s16, int16x8_t q12s16,
-                                             int16x8_t q13s16, int16x8_t q14s16,
-                                             int16x8_t q15s16) {
-  int16x8_t q0s16, q1s16, q4s16, q5s16;
-
-  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
-  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
-  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
-
-  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
-  q2s16 = vaddq_s16(q10s16, q1s16);
-  q3s16 = vaddq_s16(q11s16, q0s16);
-  q4s16 = vsubq_s16(q11s16, q0s16);
-  q5s16 = vsubq_s16(q10s16, q1s16);
-
-  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
-  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
-  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
-
-  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
-  q2s16 = vaddq_s16(q12s16, q1s16);
-  q3s16 = vaddq_s16(q13s16, q0s16);
-  q4s16 = vsubq_s16(q13s16, q0s16);
-  q5s16 = vsubq_s16(q12s16, q1s16);
-
-  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
-  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
-  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
-
-  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
-  q2s16 = vaddq_s16(q14s16, q1s16);
-  q3s16 = vaddq_s16(q15s16, q0s16);
-  q4s16 = vsubq_s16(q15s16, q0s16);
-  q5s16 = vsubq_s16(q14s16, q1s16);
-
-  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
-  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
-  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
-  return;
-}
-
-static INLINE void idct32_bands_end_2nd_pass(
-    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
-    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
-    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
-    int16x8_t q14s16, int16x8_t q15s16) {
-  uint8_t *r6 = dest + 31 * stride;
-  uint8_t *r7 = dest /* +  0 * stride*/;
-  uint8_t *r9 = dest + 15 * stride;
-  uint8_t *r10 = dest + 16 * stride;
-  int str2 = stride << 1;
-  int16x8_t q0s16, q1s16, q4s16, q5s16;
-
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
-
-  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
-
-  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
-  q2s16 = vaddq_s16(q10s16, q1s16);
-  q3s16 = vaddq_s16(q11s16, q0s16);
-  q4s16 = vsubq_s16(q11s16, q0s16);
-  q5s16 = vsubq_s16(q10s16, q1s16);
-
-  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
-
-  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
-
-  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
-  q2s16 = vaddq_s16(q12s16, q1s16);
-  q3s16 = vaddq_s16(q13s16, q0s16);
-  q4s16 = vsubq_s16(q13s16, q0s16);
-  q5s16 = vsubq_s16(q12s16, q1s16);
-
-  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
-
-  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
-
-  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
-  q2s16 = vaddq_s16(q14s16, q1s16);
-  q3s16 = vaddq_s16(q15s16, q0s16);
-  q4s16 = vsubq_s16(q15s16, q0s16);
-  q5s16 = vsubq_s16(q14s16, q1s16);
-
-  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-
-  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  return;
-}
-
-void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
-  int i, idct32_pass_loop;
-  int16_t trans_buf[32 * 8];
-  int16_t pass1[32 * 32];
-  int16_t pass2[32 * 32];
-  int16_t *out;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-
-  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
-       idct32_pass_loop++,
-      input = pass1,  // the input of pass2 is the result of pass1
-       out = pass2) {
-    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
-      idct32_transpose_pair(input, trans_buf);
-
-      // -----------------------------------------
-      // BLOCK A: 16-19,28-31
-      // -----------------------------------------
-      // generate 16,17,30,31
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(0, 1, 31)
-      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(31, 17, 15)
-      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
-      // part of stage 2
-      q4s16 = vaddq_s16(q0s16, q1s16);
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q6s16 = vaddq_s16(q2s16, q3s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
-
-      // generate 18,19,28,29
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(15, 9, 23)
-      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(23, 25, 7)
-      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
-      // part of stage 2
-      q13s16 = vsubq_s16(q3s16, q2s16);
-      q3s16 = vaddq_s16(q3s16, q2s16);
-      q14s16 = vsubq_s16(q1s16, q0s16);
-      q2s16 = vaddq_s16(q1s16, q0s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
-      // part of stage 4
-      q8s16 = vaddq_s16(q4s16, q2s16);
-      q9s16 = vaddq_s16(q5s16, q0s16);
-      q10s16 = vaddq_s16(q7s16, q1s16);
-      q15s16 = vaddq_s16(q6s16, q3s16);
-      q13s16 = vsubq_s16(q5s16, q0s16);
-      q14s16 = vsubq_s16(q7s16, q1s16);
-      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
-      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
-      // part of stage 5
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
-      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
-      // part of stage 4
-      q13s16 = vsubq_s16(q4s16, q2s16);
-      q14s16 = vsubq_s16(q6s16, q3s16);
-      // part of stage 5
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
-      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
-
-      // -----------------------------------------
-      // BLOCK B: 20-23,24-27
-      // -----------------------------------------
-      // generate 20,21,26,27
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(7, 5, 27)
-      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(27, 21, 11)
-      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
-      // part of stage 2
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-
-      // generate 22,23,24,25
-      // part of stage 1
-      LOAD_FROM_TRANSPOSED(11, 13, 19)
-      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(19, 29, 3)
-      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
-      // part of stage 2
-      q14s16 = vsubq_s16(q4s16, q5s16);
-      q5s16 = vaddq_s16(q4s16, q5s16);
-      q13s16 = vsubq_s16(q6s16, q7s16);
-      q6s16 = vaddq_s16(q6s16, q7s16);
-      // part of stage 3
-      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
-      // part of stage 4
-      q10s16 = vaddq_s16(q7s16, q1s16);
-      q11s16 = vaddq_s16(q5s16, q0s16);
-      q12s16 = vaddq_s16(q6s16, q2s16);
-      q15s16 = vaddq_s16(q4s16, q3s16);
-      // part of stage 6
-      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
-      q8s16 = vaddq_s16(q14s16, q11s16);
-      q9s16 = vaddq_s16(q13s16, q10s16);
-      q13s16 = vsubq_s16(q13s16, q10s16);
-      q11s16 = vsubq_s16(q14s16, q11s16);
-      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
-      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
-      q8s16 = vsubq_s16(q9s16, q12s16);
-      q10s16 = vaddq_s16(q14s16, q15s16);
-      q14s16 = vsubq_s16(q14s16, q15s16);
-      q12s16 = vaddq_s16(q9s16, q12s16);
-      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
-      // part of stage 7
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
-      q13s16 = q11s16;
-      q14s16 = q8s16;
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
-      // part of stage 4
-      q14s16 = vsubq_s16(q5s16, q0s16);
-      q13s16 = vsubq_s16(q6s16, q2s16);
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
-      q14s16 = vsubq_s16(q7s16, q1s16);
-      q13s16 = vsubq_s16(q4s16, q3s16);
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
-      // part of stage 6
-      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
-      q8s16 = vaddq_s16(q14s16, q1s16);
-      q9s16 = vaddq_s16(q13s16, q6s16);
-      q13s16 = vsubq_s16(q13s16, q6s16);
-      q1s16 = vsubq_s16(q14s16, q1s16);
-      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
-      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
-      q14s16 = vsubq_s16(q8s16, q5s16);
-      q10s16 = vaddq_s16(q8s16, q5s16);
-      q11s16 = vaddq_s16(q9s16, q0s16);
-      q0s16 = vsubq_s16(q9s16, q0s16);
-      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
-      // part of stage 7
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
-      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
-      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
-
-      // -----------------------------------------
-      // BLOCK C: 8-10,11-15
-      // -----------------------------------------
-      // generate 8,9,14,15
-      // part of stage 2
-      LOAD_FROM_TRANSPOSED(3, 2, 30)
-      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(30, 18, 14)
-      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
-      // part of stage 3
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
-      // part of stage 4
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
-
-      // generate 10,11,12,13
-      // part of stage 2
-      LOAD_FROM_TRANSPOSED(14, 10, 22)
-      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(22, 26, 6)
-      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
-      // part of stage 3
-      q14s16 = vsubq_s16(q4s16, q5s16);
-      q5s16 = vaddq_s16(q4s16, q5s16);
-      q13s16 = vsubq_s16(q6s16, q7s16);
-      q6s16 = vaddq_s16(q6s16, q7s16);
-      // part of stage 4
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
-      // part of stage 5
-      q8s16 = vaddq_s16(q0s16, q5s16);
-      q9s16 = vaddq_s16(q1s16, q7s16);
-      q13s16 = vsubq_s16(q1s16, q7s16);
-      q14s16 = vsubq_s16(q3s16, q4s16);
-      q10s16 = vaddq_s16(q3s16, q4s16);
-      q15s16 = vaddq_s16(q2s16, q6s16);
-      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
-      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
-      // part of stage 6
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
-      q13s16 = vsubq_s16(q0s16, q5s16);
-      q14s16 = vsubq_s16(q2s16, q6s16);
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
-
-      // -----------------------------------------
-      // BLOCK D: 0-3,4-7
-      // -----------------------------------------
-      // generate 4,5,6,7
-      // part of stage 3
-      LOAD_FROM_TRANSPOSED(6, 4, 28)
-      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(28, 20, 12)
-      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
-      // part of stage 4
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
-      // part of stage 5
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-
-      // generate 0,1,2,3
-      // part of stage 4
-      LOAD_FROM_TRANSPOSED(12, 0, 16)
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(16, 8, 24)
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
-      // part of stage 5
-      q4s16 = vaddq_s16(q7s16, q6s16);
-      q7s16 = vsubq_s16(q7s16, q6s16);
-      q6s16 = vsubq_s16(q5s16, q14s16);
-      q5s16 = vaddq_s16(q5s16, q14s16);
-      // part of stage 6
-      q8s16 = vaddq_s16(q4s16, q2s16);
-      q9s16 = vaddq_s16(q5s16, q3s16);
-      q10s16 = vaddq_s16(q6s16, q1s16);
-      q11s16 = vaddq_s16(q7s16, q0s16);
-      q12s16 = vsubq_s16(q7s16, q0s16);
-      q13s16 = vsubq_s16(q6s16, q1s16);
-      q14s16 = vsubq_s16(q5s16, q3s16);
-      q15s16 = vsubq_s16(q4s16, q2s16);
-      // part of stage 7
-      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
-      q2s16 = vaddq_s16(q8s16, q1s16);
-      q3s16 = vaddq_s16(q9s16, q0s16);
-      q4s16 = vsubq_s16(q9s16, q0s16);
-      q5s16 = vsubq_s16(q8s16, q1s16);
-      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
-      q8s16 = vaddq_s16(q4s16, q1s16);
-      q9s16 = vaddq_s16(q5s16, q0s16);
-      q6s16 = vsubq_s16(q5s16, q0s16);
-      q7s16 = vsubq_s16(q4s16, q1s16);
-
-      if (idct32_pass_loop == 0) {
-        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                                  q10s16, q11s16, q12s16, q13s16, q14s16,
-                                  q15s16);
-      } else {
-        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
-                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
-                                  q14s16, q15s16);
-        dest += 8;
-      }
-    }
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm
deleted file mode 100644
index e7793fb16..000000000
--- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm
+++ /dev/null
@@ -1,1302 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-;TODO(cd): adjust these constant to be able to use vqdmulh for faster
-;          dct_const_round_shift(a * b) within butterfly calculations.
-cospi_1_64  EQU 16364
-cospi_2_64  EQU 16305
-cospi_3_64  EQU 16207
-cospi_4_64  EQU 16069
-cospi_5_64  EQU 15893
-cospi_6_64  EQU 15679
-cospi_7_64  EQU 15426
-cospi_8_64  EQU 15137
-cospi_9_64  EQU 14811
-cospi_10_64 EQU 14449
-cospi_11_64 EQU 14053
-cospi_12_64 EQU 13623
-cospi_13_64 EQU 13160
-cospi_14_64 EQU 12665
-cospi_15_64 EQU 12140
-cospi_16_64 EQU 11585
-cospi_17_64 EQU 11003
-cospi_18_64 EQU 10394
-cospi_19_64 EQU  9760
-cospi_20_64 EQU  9102
-cospi_21_64 EQU  8423
-cospi_22_64 EQU  7723
-cospi_23_64 EQU  7005
-cospi_24_64 EQU  6270
-cospi_25_64 EQU  5520
-cospi_26_64 EQU  4756
-cospi_27_64 EQU  3981
-cospi_28_64 EQU  3196
-cospi_29_64 EQU  2404
-cospi_30_64 EQU  1606
-cospi_31_64 EQU   804
-
-
-    EXPORT  |aom_idct32x32_1024_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY
-
-    ; --------------------------------------------------------------------------
-    ; Load from transposed_buffer
-    ;   q13 = transposed_buffer[first_offset]
-    ;   q14 = transposed_buffer[second_offset]
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   transposed_buffer must be passed in. use 0 for first use.
-    MACRO
-    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
-    ; address calculation with proper stride and loading
-    add r0, #($first_offset  - $prev_offset )*8*2
-    vld1.s16        {q14}, [r0]
-    add r0, #($second_offset - $first_offset)*8*2
-    vld1.s16        {q13}, [r0]
-    ; (used) two registers (q14, q13)
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Load from output (used as temporary storage)
-    ;   reg1 = output[first_offset]
-    ;   reg2 = output[second_offset]
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   output, whether reading or storing) must be passed in. use 0 for first
-    ;   use.
-    MACRO
-    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
-    ; address calculation with proper stride and loading
-    add r1, #($first_offset  - $prev_offset )*32*2
-    vld1.s16        {$reg1}, [r1]
-    add r1, #($second_offset - $first_offset)*32*2
-    vld1.s16        {$reg2}, [r1]
-    ; (used) two registers ($reg1, $reg2)
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Store into output (sometimes as as temporary storage)
-    ;   output[first_offset] = reg1
-    ;   output[second_offset] = reg2
-    ;   for proper address calculation, the last offset used when manipulating
-    ;   output, whether reading or storing) must be passed in. use 0 for first
-    ;   use.
-    MACRO
-    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
-    ; address calculation with proper stride and storing
-    add r1, #($first_offset  - $prev_offset )*32*2
-    vst1.16 {$reg1}, [r1]
-    add r1, #($second_offset - $first_offset)*32*2
-    vst1.16 {$reg2}, [r1]
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q6-q9 contain the results (out[j * 32 + 0-31])
-    MACRO
-    STORE_COMBINE_CENTER_RESULTS
-    ; load dest[j * dest_stride + 0-31]
-    vld1.s16        {d8}, [r10], r2
-    vld1.s16        {d11}, [r9], r11
-    vld1.s16        {d9}, [r10]
-    vld1.s16        {d10}, [r9]
-    ; ROUND_POWER_OF_TWO
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q8, q8, #6
-    vrshr.s16       q9, q9, #6
-    vrshr.s16       q6, q6, #6
-    ; add to dest[j * dest_stride + 0-31]
-    vaddw.u8        q7, q7, d9
-    vaddw.u8        q8, q8, d10
-    vaddw.u8        q9, q9, d11
-    vaddw.u8        q6, q6, d8
-    ; clip pixel
-    vqmovun.s16     d9,  q7
-    vqmovun.s16     d10, q8
-    vqmovun.s16     d11, q9
-    vqmovun.s16     d8,  q6
-    ; store back into dest[j * dest_stride + 0-31]
-    vst1.16         {d9}, [r10], r11
-    vst1.16         {d10}, [r9], r2
-    vst1.16         {d8}, [r10]
-    vst1.16         {d11}, [r9]
-    ; update pointers (by dest_stride * 2)
-    sub r9,  r9,  r2, lsl #1
-    add r10, r10, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q6-q9 contain the results (out[j * 32 + 0-31])
-    ; Same sequence as STORE_COMBINE_CENTER_RESULTS, except that the last two
-    ; stores use writeback ("!"), which additionally advances r10 and r9 by
-    ; the 8 bytes just stored.
-    ; NOTE(review): presumably this moves both pointers on to the next group
-    ; of 8 columns - confirm against the call site.
-    ; Register / row pairing:
-    ;   q6 <-> row at r10          q7 <-> row at r10 + r2
-    ;   q9 <-> row at r9           q8 <-> row at r9 + r11
-    ; Overwrites d8-d11 (q4-q5) and q6-q9.
-    MACRO
-    STORE_COMBINE_CENTER_RESULTS_LAST
-    ; load dest[j * dest_stride + 0-31]
-    ;   the post-indexed loads step r10 by r2 and r9 by r11 so that the
-    ;   second load of each pair reads the neighbouring row
-    vld1.s16        {d8}, [r10], r2
-    vld1.s16        {d11}, [r9], r11
-    vld1.s16        {d9}, [r10]
-    vld1.s16        {d10}, [r9]
-    ; ROUND_POWER_OF_TWO
-    ;   rounding shift right by 6 on every 16-bit lane of the results
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q8, q8, #6
-    vrshr.s16       q9, q9, #6
-    vrshr.s16       q6, q6, #6
-    ; add to dest[j * dest_stride + 0-31]
-    ;   the 8-bit destination values are widened to 16 bits by the add
-    vaddw.u8        q7, q7, d9
-    vaddw.u8        q8, q8, d10
-    vaddw.u8        q9, q9, d11
-    vaddw.u8        q6, q6, d8
-    ; clip pixel
-    ;   saturating narrow from signed 16-bit to unsigned 8-bit
-    vqmovun.s16     d9,  q7
-    vqmovun.s16     d10, q8
-    vqmovun.s16     d11, q9
-    vqmovun.s16     d8,  q6
-    ; store back into dest[j * dest_stride + 0-31]
-    ;   the first two stores undo the row step taken by the loads; the last
-    ;   two stores post-increment r10 / r9 by the 8 bytes written
-    vst1.16         {d9}, [r10], r11
-    vst1.16         {d10}, [r9], r2
-    vst1.16         {d8}, [r10]!
-    vst1.16         {d11}, [r9]!
-    ; update pointers (by dest_stride * 2)
-    sub r9,  r9,  r2, lsl #1
-    add r10, r10, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q4-q7 contain the results (out[j * 32 + 0-31])
-    ; Each d-register load/store below moves 8 bytes of one destination row;
-    ; four rows are updated per invocation:
-    ;   q4 <-> row at r7           q5 <-> row at r7 + r2
-    ;   q7 <-> row at r6           q6 <-> row at r6 + r11
-    ; aom_idct32x32_1024_add_neon sets r2 = dest_stride, r11 = -dest_stride,
-    ; r7 = dest and r6 = dest + 31 * dest_stride.
-    ; Overwrites d4-d7 (q2-q3) and q4-q7.
-    ; On exit r7 has advanced by 2 * r2 and r6 has moved back by 2 * r2.
-    MACRO
-    STORE_COMBINE_EXTREME_RESULTS
-    ; load dest[j * dest_stride + 0-31]
-    ;   the post-indexed loads step r7 by r2 and r6 by r11 so that the
-    ;   second load of each pair reads the neighbouring row
-    vld1.s16        {d4}, [r7], r2
-    vld1.s16        {d7}, [r6], r11
-    vld1.s16        {d5}, [r7]
-    vld1.s16        {d6}, [r6]
-    ; ROUND_POWER_OF_TWO
-    ;   rounding shift right by 6 on every 16-bit lane of the results
-    vrshr.s16       q5, q5, #6
-    vrshr.s16       q6, q6, #6
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q4, q4, #6
-    ; add to dest[j * dest_stride + 0-31]
-    ;   the 8-bit destination values are widened to 16 bits by the add
-    vaddw.u8        q5, q5, d5
-    vaddw.u8        q6, q6, d6
-    vaddw.u8        q7, q7, d7
-    vaddw.u8        q4, q4, d4
-    ; clip pixel
-    ;   saturating narrow from signed 16-bit to unsigned 8-bit
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    vqmovun.s16     d4, q4
-    ; store back into dest[j * dest_stride + 0-31]
-    ;   the first two stores write the neighbouring rows; their post-index
-    ;   (r11 / r2) undoes the step taken by the loads above
-    vst1.16         {d5}, [r7], r11
-    vst1.16         {d6}, [r6], r2
-    vst1.16         {d7}, [r6]
-    vst1.16         {d4}, [r7]
-    ; update pointers (by dest_stride * 2)
-    sub r6, r6, r2, lsl #1
-    add r7, r7, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Combine-add results with current destination content
-    ;   q4-q7 contain the results (out[j * 32 + 0-31])
-    ; Same sequence as STORE_COMBINE_EXTREME_RESULTS, except that the last two
-    ; stores use writeback ("!"), which additionally advances r6 and r7 by
-    ; the 8 bytes just stored.
-    ; NOTE(review): presumably this moves both pointers on to the next group
-    ; of 8 columns - confirm against the call site.
-    ; Register / row pairing:
-    ;   q4 <-> row at r7           q5 <-> row at r7 + r2
-    ;   q7 <-> row at r6           q6 <-> row at r6 + r11
-    ; Overwrites d4-d7 (q2-q3) and q4-q7.
-    MACRO
-    STORE_COMBINE_EXTREME_RESULTS_LAST
-    ; load dest[j * dest_stride + 0-31]
-    ;   the post-indexed loads step r7 by r2 and r6 by r11 so that the
-    ;   second load of each pair reads the neighbouring row
-    vld1.s16        {d4}, [r7], r2
-    vld1.s16        {d7}, [r6], r11
-    vld1.s16        {d5}, [r7]
-    vld1.s16        {d6}, [r6]
-    ; ROUND_POWER_OF_TWO
-    ;   rounding shift right by 6 on every 16-bit lane of the results
-    vrshr.s16       q5, q5, #6
-    vrshr.s16       q6, q6, #6
-    vrshr.s16       q7, q7, #6
-    vrshr.s16       q4, q4, #6
-    ; add to dest[j * dest_stride + 0-31]
-    ;   the 8-bit destination values are widened to 16 bits by the add
-    vaddw.u8        q5, q5, d5
-    vaddw.u8        q6, q6, d6
-    vaddw.u8        q7, q7, d7
-    vaddw.u8        q4, q4, d4
-    ; clip pixel
-    ;   saturating narrow from signed 16-bit to unsigned 8-bit
-    vqmovun.s16     d5, q5
-    vqmovun.s16     d6, q6
-    vqmovun.s16     d7, q7
-    vqmovun.s16     d4, q4
-    ; store back into dest[j * dest_stride + 0-31]
-    ;   the first two stores undo the row step taken by the loads; the last
-    ;   two stores post-increment r6 / r7 by the 8 bytes written
-    vst1.16         {d5}, [r7], r11
-    vst1.16         {d6}, [r6], r2
-    vst1.16         {d7}, [r6]!
-    vst1.16         {d4}, [r7]!
-    ; update pointers (by dest_stride * 2)
-    sub r6, r6, r2, lsl #1
-    add r7, r7, r2, lsl #1
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Touches q8-q12, q15 (q13-q14 are preserved)
-    ; valid output registers are anything but q8-q11
-    ; Also overwrites the scalar registers r8 and r12, which hold the two
-    ; constants before they are broadcast into d30 / d31.
-    ;
-    ; $regA-$regD are the d-register inputs and $reg1-$reg4 the d-register
-    ; outputs. Per 16-bit lane, with products and sums held in 32 bits and a
-    ; final saturating, rounding, narrowing shift right by 14:
-    ;   $reg1 = $regC * $first_constant  - $regA * $second_constant
-    ;   $reg2 = $regD * $first_constant  - $regB * $second_constant
-    ;   $reg3 = $regC * $second_constant + $regA * $first_constant
-    ;   $reg4 = $regD * $second_constant + $regB * $first_constant
-    MACRO
-    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    ; TODO(cd): have special case to re-use constants when they are similar for
-    ;           consecutive butterflies
-    ; TODO(cd): have special case when both constants are the same, do the
-    ;           additions/subtractions before the multiplies.
-    ; generate the constants
-    ;   generate scalar constants
-    ;   each constant is assembled in two steps: bits 15-8 with mov, then
-    ;   bits 7-0 with add
-    ;   NOTE(review): presumably split because the full 16-bit value does not
-    ;   fit a single immediate operand - confirm
-    mov             r8,  #$first_constant  & 0xFF00
-    mov             r12, #$second_constant & 0xFF00
-    add             r8,  #$first_constant  & 0x00FF
-    add             r12, #$second_constant & 0x00FF
-    ;   generate vector constants
-    ;   d30 = first constant in every lane, d31 = second constant in every lane
-    vdup.16         d30, r8
-    vdup.16         d31, r12
-    ; (used) two for inputs (regA-regD), one for constants (q15)
-    ; do some multiplications (ordered for maximum latency hiding)
-    vmull.s16 q8,  $regC, d30
-    vmull.s16 q10, $regA, d31
-    vmull.s16 q9,  $regD, d30
-    vmull.s16 q11, $regB, d31
-    vmull.s16 q12, $regC, d31
-    ; (used) five for intermediate (q8-q12), one for constants (q15)
-    ; do some addition/subtractions (to get back two register)
-    ;   q8 / q9 now hold the two difference terms ($reg1 / $reg2 before shift)
-    vsub.s32  q8, q8, q10
-    vsub.s32  q9, q9, q11
-    ; do more multiplications (ordered for maximum latency hiding)
-    ;   the last multiply overwrites q15 (d30/d31); d31 is not read after
-    ;   the first multiply of this group, and d30 is read by that last
-    ;   multiply itself
-    vmull.s16 q10, $regD, d31
-    vmull.s16 q11, $regA, d30
-    vmull.s16 q15, $regB, d30
-    ; (used) six for intermediate (q8-q12, q15)
-    ; do more addition/subtractions
-    ;   q11 / q10 now hold the two sum terms ($reg3 / $reg4 before shift)
-    vadd.s32  q11, q12, q11
-    vadd.s32  q10, q10, q15
-    ; (used) four for intermediate (q8-q11)
-    ; dct_const_round_shift
-    ;   saturating rounding narrow of each 32-bit lane by 14 bits
-    vqrshrn.s32 $reg1, q8,  #14
-    vqrshrn.s32 $reg2, q9,  #14
-    vqrshrn.s32 $reg3, q11, #14
-    vqrshrn.s32 $reg4, q10, #14
-    ; (used) two for results, well four d registers
-    MEND
-    ; --------------------------------------------------------------------------
-    ; Touches q8-q12, q15 (q13-q14 are preserved)
-    ; valid output registers are anything but q8-q11
-    ; Shorthand for DO_BUTTERFLY with the inputs fixed to q14 and q13:
-    ;   $regC / $regD = d28 / d29 (q14),  $regA / $regB = d26 / d27 (q13)
-    ; Only the two constants and the four output d registers are supplied by
-    ; the caller.
-    MACRO
-    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
-    MEND
-    ; --------------------------------------------------------------------------
-
-;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
-;
-;   r0  int16_t *input,
-;   r1  uint8_t *dest,
-;   r2  int dest_stride)
-; loop counters
-;   r4  bands loop counter
-;   r5  pass loop counter
-;   r8  transpose loop counter
-; combine-add pointers
-;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
-;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
-;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
-;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
-
-|aom_idct32x32_1024_add_neon| PROC
-    ; This function does one pass of idct32x32 transform.
-    ;
-    ; This is done by transposing the input and then doing a 1d transform on
-    ; columns. In the first pass, the transposed columns are the original
-    ; rows. In the second pass, after the transposition, the columns are the
-    ; original columns.
-    ; The 1d transform is done by looping over bands of eight columns (the
-    ; idct32_bands loop). For each band, the transform input transposition
-    ; is done on demand, one band of four 8x8 matrices at a time. The four
-    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
-    push  {r4-r11}
-    vpush {d8-d15}
-    ; stack operation
-    ; internal buffer used to transpose 8 lines into before transforming them
-    ;   int16_t transpose_buffer[32 * 8];
-    ;   at sp + [4096, 4607]
-    ; results of the first pass (transpose and transform rows)
-    ;   int16_t pass1[32 * 32];
-    ;   at sp + [0, 2047]
-    ; results of the second pass (transpose and transform columns)
-    ;   int16_t pass2[32 * 32];
-    ;   at sp + [2048, 4095]
-    sub sp, sp, #512+2048+2048
-
-    ; r6  = dest + 31 * dest_stride
-    ; r7  = dest +  0 * dest_stride
-    ; r9  = dest + 15 * dest_stride
-    ; r10 = dest + 16 * dest_stride
-    rsb r6,  r2, r2, lsl #5
-    rsb r9,  r2, r2, lsl #4
-    add r10, r1, r2, lsl #4
-    mov r7, r1
-    add r6, r6, r1
-    add r9, r9, r1
-    ; r11 = -dest_stride
-    neg r11, r2
-    ; r3 = input
-    mov r3, r0
-    ; parameters for first pass
-      ; r0 = transpose_buffer[32 * 8]
-    add r0, sp, #4096
-      ; r1 = pass1[32 * 32]
-    mov r1, sp
-
-    mov r5, #0          ; initialize pass loop counter
-idct32_pass_loop
-    mov r4, #4          ; initialize bands loop counter
-idct32_bands_loop
-    mov r8, #2          ; initialize transpose loop counter
-idct32_transpose_pair_loop
-    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
-    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
-    ; adjusted to 32 because of the two post-increments.
-    vld1.s16        {q8},  [r3]!
-    vld1.s16        {q0},  [r3]!
-    add r3, #32
-    vld1.s16        {q9},  [r3]!
-    vld1.s16        {q1},  [r3]!
-    add r3, #32
-    vld1.s16        {q10}, [r3]!
-    vld1.s16        {q2},  [r3]!
-    add r3, #32
-    vld1.s16        {q11}, [r3]!
-    vld1.s16        {q3},  [r3]!
-    add r3, #32
-    vld1.s16        {q12}, [r3]!
-    vld1.s16        {q4},  [r3]!
-    add r3, #32
-    vld1.s16        {q13}, [r3]!
-    vld1.s16        {q5},  [r3]!
-    add r3, #32
-    vld1.s16        {q14}, [r3]!
-    vld1.s16        {q6},  [r3]!
-    add r3, #32
-    vld1.s16        {q15}, [r3]!
-    vld1.s16        {q7},  [r3]!
-
-    ; Transpose the two 8x8 16bit data matrices.
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vswp            d1,  d8
-    vswp            d7,  d14
-    vswp            d5,  d12
-    vswp            d3,  d10
-    vtrn.32         q8,  q10
-    vtrn.32         q9,  q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.32         q0,  q2
-    vtrn.32         q1,  q3
-    vtrn.32         q4,  q6
-    vtrn.32         q5,  q7
-    vtrn.16         q8,  q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    vtrn.16         q0,  q1
-    vtrn.16         q2,  q3
-    vtrn.16         q4,  q5
-    vtrn.16         q6,  q7
-
-    ; Store both matrices after each other. There is a stride of 32, which
-    ; adjusts to nothing because of the post-increments.
-    vst1.16        {q8},  [r0]!
-    vst1.16        {q9},  [r0]!
-    vst1.16        {q10}, [r0]!
-    vst1.16        {q11}, [r0]!
-    vst1.16        {q12}, [r0]!
-    vst1.16        {q13}, [r0]!
-    vst1.16        {q14}, [r0]!
-    vst1.16        {q15}, [r0]!
-    vst1.16        {q0},  [r0]!
-    vst1.16        {q1},  [r0]!
-    vst1.16        {q2},  [r0]!
-    vst1.16        {q3},  [r0]!
-    vst1.16        {q4},  [r0]!
-    vst1.16        {q5},  [r0]!
-    vst1.16        {q6},  [r0]!
-    vst1.16        {q7},  [r0]!
-
-    ; increment pointers by adjusted stride (not necessary for r0/out)
-    ;   go back by 7*32 for the seven lines moved fully by read and add
-    ;   go back by 32 for the eighth line only read
-    ;   advance by 16*2 to go the next pair
-    sub r3,  r3,  #7*32*2 + 32 - 16*2
-    ; transpose pair loop processing
-    subs r8, r8, #1
-    bne idct32_transpose_pair_loop
-
-    ; restore r0/input to its original value
-    sub r0, r0, #32*8*2
-
-    ; Instead of doing the transforms stage by stage, it is done by loading
-    ; some input values and doing as many stages as possible to minimize the
-    ; storing/loading of intermediate results. To fit within registers, the
-    ; final coefficients are cut into four blocks:
-    ; BLOCK A: 16-19,28-31
-    ; BLOCK B: 20-23,24-27
-    ; BLOCK C: 8-10,11-15
-    ; BLOCK D: 0-3,4-7
-    ; Blocks A and C are straight calculation through the various stages. In
-    ; block B, further calculations are performed using the results from
-    ; block A. In block D, further calculations are performed using the results
-    ; from block C and then the final calculations are done using results from
-    ; block A and B which have been combined at the end of block B.
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK A: 16-19,28-31
-    ; --------------------------------------------------------------------------
-    ; generate 16,17,30,31
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
-    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
-    ;step1b[16][i] = dct_const_round_shift(temp1);
-    ;step1b[31][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 0, 1, 31
-    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
-    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
-    ;step1b[17][i] = dct_const_round_shift(temp1);
-    ;step1b[30][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 31, 17, 15
-    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[16] =  step1b[16][i] + step1b[17][i];
-    ;step2[17] =  step1b[16][i] - step1b[17][i];
-    ;step2[30] = -step1b[30][i] + step1b[31][i];
-    ;step2[31] =  step1b[30][i] + step1b[31][i];
-    vadd.s16  q4, q0, q1
-    vsub.s16  q13, q0, q1
-    vadd.s16  q6, q2, q3
-    vsub.s16  q14, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
-    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
-    ;step3[17] = dct_const_round_shift(temp1);
-    ;step3[30] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; generate 18,19,28,29
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
-    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
-    ;step1b[18][i] = dct_const_round_shift(temp1);
-    ;step1b[29][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 15, 9, 23
-    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
-    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
-    ;step1b[19][i] = dct_const_round_shift(temp1);
-    ;step1b[28][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 23, 25, 7
-    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[18] = -step1b[18][i] + step1b[19][i];
-    ;step2[19] =  step1b[18][i] + step1b[19][i];
-    ;step2[28] =  step1b[28][i] + step1b[29][i];
-    ;step2[29] =  step1b[28][i] - step1b[29][i];
-    vsub.s16  q13, q3, q2
-    vadd.s16  q3,  q3, q2
-    vsub.s16  q14, q1, q0
-    vadd.s16  q2,  q1, q0
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
-    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
-    ;step3[29] = dct_const_round_shift(temp1);
-    ;step3[18] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
-    ; --------------------------------------------------------------------------
-    ; combine 16-19,28-31
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[16] = step1b[16][i] + step1b[19][i];
-    ;step1[17] = step1b[17][i] + step1b[18][i];
-    ;step1[18] = step1b[17][i] - step1b[18][i];
-    ;step1[29] = step1b[30][i] - step1b[29][i];
-    ;step1[30] = step1b[30][i] + step1b[29][i];
-    ;step1[31] = step1b[31][i] + step1b[28][i];
-    vadd.s16  q8,  q4, q2
-    vadd.s16  q9,  q5, q0
-    vadd.s16  q10, q7, q1
-    vadd.s16  q15, q6, q3
-    vsub.s16  q13, q5, q0
-    vsub.s16  q14, q7, q1
-    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
-    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
-    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
-    ;step2[18] = dct_const_round_shift(temp1);
-    ;step2[29] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
-    STORE_IN_OUTPUT 30, 29, 18, q1, q0
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[19] = step1b[16][i] - step1b[19][i];
-    ;step1[28] = step1b[31][i] - step1b[28][i];
-    vsub.s16  q13, q4, q2
-    vsub.s16  q14, q6, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
-    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
-    ;step2[19] = dct_const_round_shift(temp1);
-    ;step2[28] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
-    STORE_IN_OUTPUT 18, 19, 28, q4, q6
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK B: 20-23,24-27
-    ; --------------------------------------------------------------------------
-    ; generate 20,21,26,27
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
-    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
-    ;step1b[20][i] = dct_const_round_shift(temp1);
-    ;step1b[27][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 7, 5, 27
-    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
-    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
-    ;step1b[21][i] = dct_const_round_shift(temp1);
-    ;step1b[26][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 27, 21, 11
-    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[20] =  step1b[20][i] + step1b[21][i];
-    ;step2[21] =  step1b[20][i] - step1b[21][i];
-    ;step2[26] = -step1b[26][i] + step1b[27][i];
-    ;step2[27] =  step1b[26][i] + step1b[27][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
-    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
-    ;step3[21] = dct_const_round_shift(temp1);
-    ;step3[26] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 22,23,24,25
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
-    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
-    ;step1b[22][i] = dct_const_round_shift(temp1);
-    ;step1b[25][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 11, 13, 19
-    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 1
-    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
-    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
-    ;step1b[23][i] = dct_const_round_shift(temp1);
-    ;step1b[24][i] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 19, 29, 3
-    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;step2[22] = -step1b[22][i] + step1b[23][i];
-    ;step2[23] =  step1b[22][i] + step1b[23][i];
-    ;step2[24] =  step1b[24][i] + step1b[25][i];
-    ;step2[25] =  step1b[24][i] - step1b[25][i];
-    vsub.s16  q14, q4, q5
-    vadd.s16  q5, q4, q5
-    vsub.s16  q13, q6, q7
-    vadd.s16  q6, q6, q7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
-    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
-    ;step3[25] = dct_const_round_shift(temp1);
-    ;step3[22] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
-    ; --------------------------------------------------------------------------
-    ; combine 20-23,24-27
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[22] = step1b[22][i] + step1b[21][i];
-    ;step1[23] = step1b[23][i] + step1b[20][i];
-    vadd.s16  q10, q7, q1
-    vadd.s16  q11, q5, q0
-    ;step1[24] = step1b[24][i] + step1b[27][i];
-    ;step1[25] = step1b[25][i] + step1b[26][i];
-    vadd.s16  q12, q6, q2
-    vadd.s16  q15, q4, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[16] = step1b[16][i] + step1b[23][i];
-    ;step3[17] = step1b[17][i] + step1b[22][i];
-    ;step3[22] = step1b[17][i] - step1b[22][i];
-    ;step3[23] = step1b[16][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
-    vadd.s16  q8,  q14, q11
-    vadd.s16  q9,  q13, q10
-    vsub.s16  q13, q13, q10
-    vsub.s16  q11, q14, q11
-    STORE_IN_OUTPUT 17, 17, 16, q9, q8
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[24] = step1b[31][i] - step1b[24][i];
-    ;step3[25] = step1b[30][i] - step1b[25][i];
-    ;step3[30] = step1b[30][i] + step1b[25][i];
-    ;step3[31] = step1b[31][i] + step1b[24][i];
-    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
-    vsub.s16  q8,  q9,  q12
-    vadd.s16  q10, q14, q15
-    vsub.s16  q14, q14, q15
-    vadd.s16  q12, q9,  q12
-    STORE_IN_OUTPUT 31, 30, 31, q10, q12
-    ; --------------------------------------------------------------------------
-    ; TODO(cd) do some register allocation change to remove these push/pop
-    vpush {q8}  ; [24]
-    vpush {q11} ; [23]
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
-    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
-    ;step1[22] = dct_const_round_shift(temp1);
-    ;step1[25] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 31, 25, 22, q14, q13
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
-    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
-    ;step1[23] = dct_const_round_shift(temp1);
-    ;step1[24] = dct_const_round_shift(temp2);
-    ; TODO(cd) do some register allocation change to remove these push/pop
-    vpop  {q13} ; [23]
-    vpop  {q14} ; [24]
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 22, 24, 23, q14, q13
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[20] = step1b[23][i] - step1b[20][i];
-    ;step1[27] = step1b[24][i] - step1b[27][i];
-    vsub.s16  q14, q5, q0
-    vsub.s16  q13, q6, q2
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
-    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
-    ;step2[27] = dct_const_round_shift(temp1);
-    ;step2[20] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[21] = step1b[22][i] - step1b[21][i];
-    ;step1[26] = step1b[25][i] - step1b[26][i];
-    vsub.s16  q14,  q7, q1
-    vsub.s16  q13,  q4, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
-    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
-    ;step2[26] = dct_const_round_shift(temp1);
-    ;step2[21] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[18] = step1b[18][i] + step1b[21][i];
-    ;step3[19] = step1b[19][i] + step1b[20][i];
-    ;step3[20] = step1b[19][i] - step1b[20][i];
-    ;step3[21] = step1b[18][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
-    vadd.s16  q8,  q14, q1
-    vadd.s16  q9,  q13, q6
-    vsub.s16  q13, q13, q6
-    vsub.s16  q1,  q14, q1
-    STORE_IN_OUTPUT 19, 18, 19, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[27] = step1b[28][i] - step1b[27][i];
-    ;step3[28] = step1b[28][i] + step1b[27][i];
-    ;step3[29] = step1b[29][i] + step1b[26][i];
-    ;step3[26] = step1b[29][i] - step1b[26][i];
-    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
-    vsub.s16  q14, q8, q5
-    vadd.s16  q10, q8, q5
-    vadd.s16  q11, q9, q0
-    vsub.s16  q0, q9, q0
-    STORE_IN_OUTPUT 29, 28, 29, q10, q11
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
-    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
-    ;step1[20] = dct_const_round_shift(temp1);
-    ;step1[27] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
-    STORE_IN_OUTPUT 29, 20, 27, q13, q14
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
-    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
-    ;step1[21] = dct_const_round_shift(temp1);
-    ;step1[26] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
-    STORE_IN_OUTPUT 27, 21, 26, q1, q0
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK C: 8-10,11-15
-    ; --------------------------------------------------------------------------
-    ; generate 8,9,14,15
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
-    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
-    ;step2[8] = dct_const_round_shift(temp1);
-    ;step2[15] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 3, 2, 30
-    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
-    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
-    ;step2[9] = dct_const_round_shift(temp1);
-    ;step2[14] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 30, 18, 14
-    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;step3[8] = step1b[8][i] + step1b[9][i];
-    ;step3[9] = step1b[8][i] - step1b[9][i];
-    ;step3[14] = step1b[15][i] - step1b[14][i];
-    ;step3[15] = step1b[15][i] + step1b[14][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
-    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
-    ;step1[9]  = dct_const_round_shift(temp1);
-    ;step1[14] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 10,11,12,13
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
-    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
-    ;step2[10] = dct_const_round_shift(temp1);
-    ;step2[13] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 14, 10, 22
-    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 2
-    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
-    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
-    ;step2[11] = dct_const_round_shift(temp1);
-    ;step2[12] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 22, 26, 6
-    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;step3[10] = step1b[11][i] - step1b[10][i];
-    ;step3[11] = step1b[11][i] + step1b[10][i];
-    ;step3[12] = step1b[12][i] + step1b[13][i];
-    ;step3[13] = step1b[12][i] - step1b[13][i];
-    vsub.s16  q14, q4, q5
-    vadd.s16  q5, q4, q5
-    vsub.s16  q13, q6, q7
-    vadd.s16  q6, q6, q7
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
-    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
-    ;step1[13] = dct_const_round_shift(temp1);
-    ;step1[10] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
-    ; --------------------------------------------------------------------------
-    ; combine 8-10,11-15
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[8]  = step1b[8][i] + step1b[11][i];
-    ;step2[9]  = step1b[9][i] + step1b[10][i];
-    ;step2[10] = step1b[9][i] - step1b[10][i];
-    vadd.s16  q8,  q0, q5
-    vadd.s16  q9,  q1, q7
-    vsub.s16  q13, q1, q7
-    ;step2[13] = step1b[14][i] - step1b[13][i];
-    ;step2[14] = step1b[14][i] + step1b[13][i];
-    ;step2[15] = step1b[15][i] + step1b[12][i];
-    vsub.s16  q14, q3, q4
-    vadd.s16  q10, q3, q4
-    vadd.s16  q15, q2, q6
-    STORE_IN_OUTPUT 26, 8, 15, q8, q15
-    STORE_IN_OUTPUT 15, 9, 14, q9, q10
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
-    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
-    ;step3[10] = dct_const_round_shift(temp1);
-    ;step3[13] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    STORE_IN_OUTPUT 14, 13, 10, q3, q1
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[11] = step1b[8][i] - step1b[11][i];
-    ;step2[12] = step1b[15][i] - step1b[12][i];
-    vsub.s16  q13, q0, q5
-    vsub.s16  q14,  q2, q6
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
-    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
-    ;step3[11] = dct_const_round_shift(temp1);
-    ;step3[12] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    STORE_IN_OUTPUT 10, 11, 12, q1, q3
-    ; --------------------------------------------------------------------------
-
-
-    ; --------------------------------------------------------------------------
-    ; BLOCK D: 0-3,4-7
-    ; --------------------------------------------------------------------------
-    ; generate 4,5,6,7
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
-    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
-    ;step3[4] = dct_const_round_shift(temp1);
-    ;step3[7] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 6, 4, 28
-    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
-    ; --------------------------------------------------------------------------
-    ; part of stage 3
-    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
-    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
-    ;step3[5] = dct_const_round_shift(temp1);
-    ;step3[6] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 28, 20, 12
-    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;step1[4] = step1b[4][i] + step1b[5][i];
-    ;step1[5] = step1b[4][i] - step1b[5][i];
-    ;step1[6] = step1b[7][i] - step1b[6][i];
-    ;step1[7] = step1b[7][i] + step1b[6][i];
-    vsub.s16  q13, q0, q1
-    vadd.s16  q0, q0, q1
-    vsub.s16  q14, q2, q3
-    vadd.s16  q2, q2, q3
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
-    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
-    ;step2[5] = dct_const_round_shift(temp1);
-    ;step2[6] = dct_const_round_shift(temp2);
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
-    ; --------------------------------------------------------------------------
-    ; generate 0,1,2,3
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
-    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
-    ;step1[1] = dct_const_round_shift(temp1);
-    ;step1[0] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 12, 0, 16
-    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
-    ; --------------------------------------------------------------------------
-    ; part of stage 4
-    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
-    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
-    ;step1[2] = dct_const_round_shift(temp1);
-    ;step1[3] = dct_const_round_shift(temp2);
-    LOAD_FROM_TRANSPOSED 16, 8, 24
-    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
-    ; --------------------------------------------------------------------------
-    ; part of stage 5
-    ;step2[0] = step1b[0][i] + step1b[3][i];
-    ;step2[1] = step1b[1][i] + step1b[2][i];
-    ;step2[2] = step1b[1][i] - step1b[2][i];
-    ;step2[3] = step1b[0][i] - step1b[3][i];
-    vadd.s16  q4, q7, q6
-    vsub.s16  q7, q7, q6
-    vsub.s16  q6, q5, q14
-    vadd.s16  q5, q5, q14
-    ; --------------------------------------------------------------------------
-    ; combine 0-3,4-7
-    ; --------------------------------------------------------------------------
-    ; part of stage 6
-    ;step3[0] = step1b[0][i] + step1b[7][i];
-    ;step3[1] = step1b[1][i] + step1b[6][i];
-    ;step3[2] = step1b[2][i] + step1b[5][i];
-    ;step3[3] = step1b[3][i] + step1b[4][i];
-    vadd.s16  q8,  q4, q2
-    vadd.s16  q9,  q5, q3
-    vadd.s16  q10, q6, q1
-    vadd.s16  q11, q7, q0
-    ;step3[4] = step1b[3][i] - step1b[4][i];
-    ;step3[5] = step1b[2][i] - step1b[5][i];
-    ;step3[6] = step1b[1][i] - step1b[6][i];
-    ;step3[7] = step1b[0][i] - step1b[7][i];
-    vsub.s16  q12, q7, q0
-    vsub.s16  q13, q6, q1
-    vsub.s16  q14, q5, q3
-    vsub.s16  q15, q4, q2
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[0] = step1b[0][i] + step1b[15][i];
-    ;step1[1] = step1b[1][i] + step1b[14][i];
-    ;step1[14] = step1b[1][i] - step1b[14][i];
-    ;step1[15] = step1b[0][i] - step1b[15][i];
-    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
-    vadd.s16  q2, q8, q1
-    vadd.s16  q3, q9, q0
-    vsub.s16  q4, q9, q0
-    vsub.s16  q5, q8, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
-    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
-    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
-    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
-    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-
-    cmp r5, #0
-    bgt idct32_bands_end_2nd_pass
-
-idct32_bands_end_1st_pass
-    STORE_IN_OUTPUT 17, 16, 17, q6, q7
-    STORE_IN_OUTPUT 17, 14, 15, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
-    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
-    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
-    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
-    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 31, 30, 31, q6, q7
-    STORE_IN_OUTPUT 31,  0,  1, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[2] = step1b[2][i] + step1b[13][i];
-    ;step1[3] = step1b[3][i] + step1b[12][i];
-    ;step1[12] = step1b[3][i] - step1b[12][i];
-    ;step1[13] = step1b[2][i] - step1b[13][i];
-    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
-    vadd.s16  q2, q10, q1
-    vadd.s16  q3, q11, q0
-    vsub.s16  q4, q11, q0
-    vsub.s16  q5, q10, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
-    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
-    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
-    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
-    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 19, 18, 19, q6, q7
-    STORE_IN_OUTPUT 19, 12, 13, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
-    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
-    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
-    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
-    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 29, 28, 29, q6, q7
-    STORE_IN_OUTPUT 29,  2,  3, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[4] = step1b[4][i] + step1b[11][i];
-    ;step1[5] = step1b[5][i] + step1b[10][i];
-    ;step1[10] = step1b[5][i] - step1b[10][i];
-    ;step1[11] = step1b[4][i] - step1b[11][i];
-    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
-    vadd.s16  q2, q12, q1
-    vadd.s16  q3, q13, q0
-    vsub.s16  q4, q13, q0
-    vsub.s16  q5, q12, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
-    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
-    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
-    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 21, 20, 21, q6, q7
-    STORE_IN_OUTPUT 21, 10, 11, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
-    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
-    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
-    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
-    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 27, 26, 27, q6, q7
-    STORE_IN_OUTPUT 27,  4,  5, q4, q5
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[6] = step1b[6][i] + step1b[9][i];
-    ;step1[7] = step1b[7][i] + step1b[8][i];
-    ;step1[8] = step1b[7][i] - step1b[8][i];
-    ;step1[9] = step1b[6][i] - step1b[9][i];
-    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
-    vadd.s16  q2, q14, q1
-    vadd.s16  q3, q15, q0
-    vsub.s16  q4, q15, q0
-    vsub.s16  q5, q14, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
-    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
-    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
-    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_IN_OUTPUT 23, 22, 23, q6, q7
-    STORE_IN_OUTPUT 23, 8, 9, q8, q9
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
-    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
-    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
-    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
-    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_IN_OUTPUT 25, 24, 25, q6, q7
-    STORE_IN_OUTPUT 25,  6,  7, q4, q5
-
-    ; restore r0 by removing the last offset from the last
-    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
-    sub r0, r0, #24*8*2
-    ; restore r1 by removing the last offset from the last
-    ;     operation (STORE_IN_OUTPUT 24,  6,  7) => 7*32*2
-    ; advance by 8 columns => 8*2
-    sub r1, r1, #7*32*2 - 8*2
-    ;   advance by 8 lines (8*32*2)
-    ;   go back by the two pairs from the loop (32*2)
-    add r3, r3, #8*32*2 - 32*2
-
-    ; bands loop processing
-    subs r4, r4, #1
-    bne idct32_bands_loop
-
-    ; parameters for second pass
-    ; the input of pass2 is the result of pass1. we have to remove the offset
-    ;   of 32 columns induced by the above idct32_bands_loop
-    sub r3, r1, #32*2
-      ; r1 = pass2[32 * 32]
-    add r1, sp, #2048
-
-    ; pass loop processing
-    add r5, r5, #1
-    b idct32_pass_loop
-
-idct32_bands_end_2nd_pass
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
-    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
-    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
-    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
-    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[2] = step1b[2][i] + step1b[13][i];
-    ;step1[3] = step1b[3][i] + step1b[12][i];
-    ;step1[12] = step1b[3][i] - step1b[12][i];
-    ;step1[13] = step1b[2][i] - step1b[13][i];
-    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
-    vadd.s16  q2, q10, q1
-    vadd.s16  q3, q11, q0
-    vsub.s16  q4, q11, q0
-    vsub.s16  q5, q10, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
-    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
-    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
-    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
-    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
-    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
-    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
-    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
-    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[4] = step1b[4][i] + step1b[11][i];
-    ;step1[5] = step1b[5][i] + step1b[10][i];
-    ;step1[10] = step1b[5][i] - step1b[10][i];
-    ;step1[11] = step1b[4][i] - step1b[11][i];
-    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
-    vadd.s16  q2, q12, q1
-    vadd.s16  q3, q13, q0
-    vsub.s16  q4, q13, q0
-    vsub.s16  q5, q12, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
-    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
-    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
-    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
-    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
-    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
-    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
-    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
-    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS
-    ; --------------------------------------------------------------------------
-    ; part of stage 7
-    ;step1[6] = step1b[6][i] + step1b[9][i];
-    ;step1[7] = step1b[7][i] + step1b[8][i];
-    ;step1[8] = step1b[7][i] - step1b[8][i];
-    ;step1[9] = step1b[6][i] - step1b[9][i];
-    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
-    vadd.s16  q2, q14, q1
-    vadd.s16  q3, q15, q0
-    vsub.s16  q4, q15, q0
-    vsub.s16  q5, q14, q1
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
-    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
-    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
-    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
-    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
-    vadd.s16  q8, q4, q1
-    vadd.s16  q9, q5, q0
-    vsub.s16  q6, q5, q0
-    vsub.s16  q7, q4, q1
-    STORE_COMBINE_CENTER_RESULTS_LAST
-    ; --------------------------------------------------------------------------
-    ; part of final stage
-    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
-    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
-    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
-    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
-    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
-    vadd.s16  q4, q2, q1
-    vadd.s16  q5, q3, q0
-    vsub.s16  q6, q3, q0
-    vsub.s16  q7, q2, q1
-    STORE_COMBINE_EXTREME_RESULTS_LAST
-    ; --------------------------------------------------------------------------
-    ; restore pointers to their initial indices for next band pass by
-    ;     removing/adding dest_stride * 8. The actual increment by eight
-    ;     is taken care of within the _LAST macros.
-    add r6,  r6,  r2, lsl #3
-    add r9,  r9,  r2, lsl #3
-    sub r7,  r7,  r2, lsl #3
-    sub r10, r10, r2, lsl #3
-
-    ; restore r0 by removing the last offset from the last
-    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
-    sub r0, r0, #24*8*2
-    ; restore r1 by removing the last offset from the last
-    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
-    ; advance by 8 columns => 8*2
-    sub r1, r1, #25*32*2 - 8*2
-    ;   advance by 8 lines (8*32*2)
-    ;   go back by the two pairs from the loop (32*2)
-    add r3, r3, #8*32*2 - 32*2
-
-    ; bands loop processing
-    subs r4, r4, #1
-    bne idct32_bands_loop
-
-    ; stack operation
-    add sp, sp, #512+2048+2048
-    vpop {d8-d15}
-    pop  {r4-r11}
-    bx              lr
-    ENDP  ; |aom_idct32x32_1024_add_neon|
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
deleted file mode 100644
index 3df7a901b..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d6u8;
-  uint32x2_t d2u32 = vdup_n_u32(0);
-  uint16x8_t q8u16;
-  int16x8_t q0s16;
-  uint8_t *d1, *d2;
-  int16_t i, a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  q0s16 = vdupq_n_s16(a1);
-
-  // dc_only_idct_add
-  d1 = d2 = dest;
-  for (i = 0; i < 2; i++) {
-    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
-    d1 += dest_stride;
-    d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
-    d1 += dest_stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32));
-    d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-
-    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
-    d2 += dest_stride;
-    vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
-    d2 += dest_stride;
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm
deleted file mode 100644
index 6bd733d5d..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm
+++ /dev/null
@@ -1,71 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-
-    EXPORT  |aom_idct4x4_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct4x4_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 4)
-    add              r0, r0, #8                ; + (1 <<((4) - 1))
-    asr              r0, r0, #4                ; >> 4
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    vld1.32          {d2[0]}, [r1], r2
-    vld1.32          {d2[1]}, [r1], r2
-    vld1.32          {d4[0]}, [r1], r2
-    vld1.32          {d4[1]}, [r1]
-
-    vaddw.u8         q8, q0, d2                ; dest[x] + a1
-    vaddw.u8         q9, q0, d4
-
-    vqmovun.s16      d6, q8                    ; clip_pixel
-    vqmovun.s16      d7, q9
-
-    vst1.32          {d6[0]}, [r12], r2
-    vst1.32          {d6[1]}, [r12], r2
-    vst1.32          {d7[0]}, [r12], r2
-    vst1.32          {d7[1]}, [r12]
-
-    bx               lr
-    ENDP             ; |aom_idct4x4_1_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
deleted file mode 100644
index 763be1ab0..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/txfm_common.h"
-
-void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d26u8, d27u8;
-  uint32x2_t d26u32, d27u32;
-  uint16x8_t q8u16, q9u16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
-  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
-  int16x8_t q8s16, q9s16, q13s16, q14s16;
-  int32x4_t q1s32, q13s32, q14s32, q15s32;
-  int16x4x2_t d0x2s16, d1x2s16;
-  int32x4x2_t q0x2s32;
-  uint8_t *d;
-
-  d26u32 = d27u32 = vdup_n_u32(0);
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-
-  d0x2s16 = vtrn_s16(d16s16, d17s16);
-  d1x2s16 = vtrn_s16(d18s16, d19s16);
-  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
-  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
-  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
-  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
-  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  // stage 1
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, d22s16);
-  q1s32 = vmull_s16(d17s16, d20s16);
-  q13s32 = vmull_s16(d23s16, d21s16);
-  q14s32 = vmull_s16(d24s16, d21s16);
-
-  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
-  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
-  d26s16 = vqrshrn_n_s32(q13s32, 14);
-  d27s16 = vqrshrn_n_s32(q14s32, 14);
-  d29s16 = vqrshrn_n_s32(q15s32, 14);
-  d28s16 = vqrshrn_n_s32(q1s32, 14);
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-
-  // stage 2
-  q8s16 = vaddq_s16(q13s16, q14s16);
-  q9s16 = vsubq_s16(q13s16, q14s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
-  d19s16 = vget_low_s16(q9s16);
-
-  d0x2s16 = vtrn_s16(d16s16, d17s16);
-  d1x2s16 = vtrn_s16(d18s16, d19s16);
-  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
-  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
-  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
-  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
-
-  // do the transform on columns
-  // stage 1
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, d22s16);
-  q1s32 = vmull_s16(d17s16, d20s16);
-  q13s32 = vmull_s16(d23s16, d21s16);
-  q14s32 = vmull_s16(d24s16, d21s16);
-
-  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
-  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
-
-  d26s16 = vqrshrn_n_s32(q13s32, 14);
-  d27s16 = vqrshrn_n_s32(q14s32, 14);
-  d29s16 = vqrshrn_n_s32(q15s32, 14);
-  d28s16 = vqrshrn_n_s32(q1s32, 14);
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-
-  // stage 2
-  q8s16 = vaddq_s16(q13s16, q14s16);
-  q9s16 = vsubq_s16(q13s16, q14s16);
-
-  q8s16 = vrshrq_n_s16(q8s16, 4);
-  q9s16 = vrshrq_n_s16(q9s16, 4);
-
-  d = dest;
-  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
-  d += dest_stride;
-  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
-  d += dest_stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
-  d += dest_stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
-  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
-  d = dest;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
-  d += dest_stride;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
-  d += dest_stride;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
-  d += dest_stride;
-  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm
deleted file mode 100644
index 127acf614..000000000
--- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm
+++ /dev/null
@@ -1,193 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_idct4x4_16_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct4x4_16_add_neon| PROC
-
-    ; The 2D transform is done with two passes which are actually pretty
-    ; similar. We first transform the rows. This is done by transposing
-    ; the inputs, doing an SIMD column transform (the columns are the
-    ; transposed rows) and then transpose the results (so that it goes back
-    ; in normal/row positions). Then, we transform the columns by doing
-    ; another SIMD column transform.
-    ; So, two passes of a transpose followed by a column transform.
-
-    ; load the inputs into q8-q9, d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-
-    ; generate scalar constants
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov             r0, #0x3b00
-    add             r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov             r3, #0x2d00
-    add             r3, #0x41
-    ; cospi_24_64 = 6270 = 0x 187e
-    mov             r12, #0x1800
-    add             r12, #0x7e
-
-    ; transpose the input data
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-
-    ; generate constant vectors
-    vdup.16         d20, r0         ; replicate cospi_8_64
-    vdup.16         d21, r3         ; replicate cospi_16_64
-
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    vdup.16         d22, r12        ; replicate cospi_24_64
-
-    ; do the transform on transposed rows
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-    vswp     d18, d19
-
-    ; transpose the results
-    ; 00 01 02 03   d16
-    ; 10 11 12 13   d17
-    ; 20 21 22 23   d18
-    ; 30 31 32 33   d19
-    vtrn.16         d16, d17
-    vtrn.16         d18, d19
-    ; 00 10 02 12   d16
-    ; 01 11 03 13   d17
-    ; 20 30 22 32   d18
-    ; 21 31 23 33   d19
-    vtrn.32         q8, q9
-    ; 00 10 20 30   d16
-    ; 01 11 21 31   d17
-    ; 02 12 22 32   d18
-    ; 03 13 23 33   d19
-
-    ; do the transform on columns
-
-    ; stage 1
-    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
-    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
-
-    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
-    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64
-
-    ; (input[0] + input[2]) * cospi_16_64;
-    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q13, d23, d21
-    vmull.s16 q14, d24, d21
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
-    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
-    vmlsl.s16 q15, d19, d20
-    vmlal.s16 q1,  d19, d22
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q1,  #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16 q8,  q13, q14
-    vsub.s16 q9,  q13, q14
-
-    ; The results are in two registers, one of them being swapped. This will
-    ; be taken care of by loading the 'dest' value in a swapped fashion and
-    ; also storing them in the same swapped fashion.
-    ; temp_out[0, 1] = d16, d17 = q8
-    ; temp_out[2, 3] = d19, d18 = q9 swapped
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16 q8, q8, #4
-    vrshr.s16 q9, q9, #4
-
-    vld1.32 {d26[0]}, [r1], r2
-    vld1.32 {d26[1]}, [r1], r2
-    vld1.32 {d27[1]}, [r1], r2
-    vld1.32 {d27[0]}, [r1]  ; no post-increment
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8 q8, q8, d26
-    vaddw.u8 q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb r2, r2, #0
-    vst1.32 {d27[0]}, [r1], r2
-    vst1.32 {d27[1]}, [r1], r2
-    vst1.32 {d26[1]}, [r1], r2
-    vst1.32 {d26[0]}, [r1]  ; no post-increment
-    bx              lr
-    ENDP  ; |aom_idct4x4_16_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
deleted file mode 100644
index c7926f9e4..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8x8_t d2u8, d3u8, d30u8, d31u8;
-  uint64x1_t d2u64, d3u64, d4u64, d5u64;
-  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
-  int16x8_t q0s16;
-  uint8_t *d1, *d2;
-  int16_t i, a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-
-  q0s16 = vdupq_n_s16(a1);
-  q0u16 = vreinterpretq_u16_s16(q0s16);
-
-  d1 = d2 = dest;
-  for (i = 0; i < 2; i++) {
-    d2u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-    d3u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-    d4u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-    d5u64 = vld1_u64((const uint64_t *)d1);
-    d1 += dest_stride;
-
-    q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
-    q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
-    q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
-    q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
-
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
-    d2 += dest_stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
-    d2 += dest_stride;
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm
deleted file mode 100644
index ec07e2053..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm
+++ /dev/null
@@ -1,91 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-
-
-    EXPORT  |aom_idct8x8_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-;                                  int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct8x8_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 5)
-    add              r0, r0, #16               ; + (1 <<((5) - 1))
-    asr              r0, r0, #5                ; >> 5
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    ; load destination data
-    vld1.64          {d2}, [r1], r2
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r2
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r2
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r2
-    vld1.64          {d17}, [r1]
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |aom_idct8x8_1_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
deleted file mode 100644
index 8ad70862d..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-
-static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
-                                int16x8_t *q10s16, int16x8_t *q11s16,
-                                int16x8_t *q12s16, int16x8_t *q13s16,
-                                int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
-  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
-  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
-  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
-  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
-  *q12s16 = vcombine_s16(d17s16, d25s16);
-  *q13s16 = vcombine_s16(d19s16, d27s16);
-  *q14s16 = vcombine_s16(d21s16, d29s16);
-  *q15s16 = vcombine_s16(d23s16, d31s16);
-
-  q0x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
-  q1x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
-  q2x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
-  q3x2s32 =
-      vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
-
-  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
-                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
-  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
-                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
-  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
-                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
-  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
-                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
-
-  *q8s16 = q0x2s16.val[0];
-  *q9s16 = q0x2s16.val[1];
-  *q10s16 = q1x2s16.val[0];
-  *q11s16 = q1x2s16.val[1];
-  *q12s16 = q2x2s16.val[0];
-  *q13s16 = q2x2s16.val[1];
-  *q14s16 = q3x2s16.val[0];
-  *q15s16 = q3x2s16.val[1];
-  return;
-}
-
-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                              int16x8_t *q10s16, int16x8_t *q11s16,
-                              int16x8_t *q12s16, int16x8_t *q13s16,
-                              int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
-  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
-  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d26s16, d2s16);
-  q6s32 = vmull_s16(d27s16, d2s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
-  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
-  d8s16 = vqrshrn_n_s32(q2s32, 14);
-  d9s16 = vqrshrn_n_s32(q3s32, 14);
-  d10s16 = vqrshrn_n_s32(q5s32, 14);
-  d11s16 = vqrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q2s32 = vmull_s16(d18s16, d1s16);
-  q3s32 = vmull_s16(d19s16, d1s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q13s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
-  d14s16 = vqrshrn_n_s32(q2s32, 14);
-  d15s16 = vqrshrn_n_s32(q3s32, 14);
-  d12s16 = vqrshrn_n_s32(q9s32, 14);
-  d13s16 = vqrshrn_n_s32(q13s32, 14);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d0s16);
-  q3s32 = vmull_s16(d17s16, d0s16);
-  q13s32 = vmull_s16(d16s16, d0s16);
-  q15s32 = vmull_s16(d17s16, d0s16);
-
-  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
-  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
-  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
-
-  d18s16 = vqrshrn_n_s32(q2s32, 14);
-  d19s16 = vqrshrn_n_s32(q3s32, 14);
-  d22s16 = vqrshrn_n_s32(q13s32, 14);
-  d23s16 = vqrshrn_n_s32(q15s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q2s32 = vmull_s16(d20s16, d0s16);
-  q3s32 = vmull_s16(d21s16, d0s16);
-  q8s32 = vmull_s16(d20s16, d1s16);
-  q12s32 = vmull_s16(d21s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
-  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
-  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
-  d26s16 = vqrshrn_n_s32(q2s32, 14);
-  d27s16 = vqrshrn_n_s32(q3s32, 14);
-  d30s16 = vqrshrn_n_s32(q8s32, 14);
-  d31s16 = vqrshrn_n_s32(q12s32, 14);
-  *q13s16 = vcombine_s16(d26s16, d27s16);
-  *q15s16 = vcombine_s16(d30s16, d31s16);
-
-  q0s16 = vaddq_s16(*q9s16, *q15s16);
-  q1s16 = vaddq_s16(*q11s16, *q13s16);
-  q2s16 = vsubq_s16(*q11s16, *q13s16);
-  q3s16 = vsubq_s16(*q9s16, *q15s16);
-
-  *q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  *q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vqrshrn_n_s32(q9s32, 14);
-  d11s16 = vqrshrn_n_s32(q10s32, 14);
-  d12s16 = vqrshrn_n_s32(q11s32, 14);
-  d13s16 = vqrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  *q8s16 = vaddq_s16(q0s16, q7s16);
-  *q9s16 = vaddq_s16(q1s16, q6s16);
-  *q10s16 = vaddq_s16(q2s16, q5s16);
-  *q11s16 = vaddq_s16(q3s16, q4s16);
-  *q12s16 = vsubq_s16(q3s16, q4s16);
-  *q13s16 = vsubq_s16(q2s16, q5s16);
-  *q14s16 = vsubq_s16(q1s16, q6s16);
-  *q15s16 = vsubq_s16(q0s16, q7s16);
-  return;
-}
-
-void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 16);
-  q11s16 = vld1q_s16(input + 24);
-  q12s16 = vld1q_s16(input + 32);
-  q13s16 = vld1q_s16(input + 40);
-  q14s16 = vld1q_s16(input + 48);
-  q15s16 = vld1q_s16(input + 56);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-             &q15s16);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-             &q15s16);
-
-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  d1 = d2 = dest;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-
-  q8s16 = q12s16;
-  q9s16 = q13s16;
-  q10s16 = q14s16;
-  q11s16 = q15s16;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-  return;
-}
-
-void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
-  int16x4_t d26s16, d27s16, d28s16, d29s16;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
-  int32x4_t q9s32, q10s32, q11s32, q12s32;
-
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 16);
-  q11s16 = vld1q_s16(input + 24);
-  q12s16 = vld1q_s16(input + 32);
-  q13s16 = vld1q_s16(input + 40);
-  q14s16 = vld1q_s16(input + 48);
-  q15s16 = vld1q_s16(input + 56);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  // First transform rows
-  // stage 1
-  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
-  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
-
-  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-
-  q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
-
-  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
-
-  q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
-
-  q5s16 = vqrdmulhq_s16(q11s16, q0s16);
-
-  q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
-
-  q6s16 = vqrdmulhq_s16(q11s16, q1s16);
-
-  // stage 2 & stage 3 - even half
-  q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
-
-  q9s16 = vqrdmulhq_s16(q8s16, q0s16);
-
-  q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
-
-  q13s16 = vqrdmulhq_s16(q10s16, q1s16);
-
-  q15s16 = vqrdmulhq_s16(q10s16, q0s16);
-
-  // stage 3 -odd half
-  q0s16 = vaddq_s16(q9s16, q15s16);
-  q1s16 = vaddq_s16(q9s16, q13s16);
-  q2s16 = vsubq_s16(q9s16, q13s16);
-  q3s16 = vsubq_s16(q9s16, q15s16);
-
-  // stage 2 - odd half
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vqrshrn_n_s32(q9s32, 14);
-  d11s16 = vqrshrn_n_s32(q10s32, 14);
-  d12s16 = vqrshrn_n_s32(q11s32, 14);
-  d13s16 = vqrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 4
-  q8s16 = vaddq_s16(q0s16, q7s16);
-  q9s16 = vaddq_s16(q1s16, q6s16);
-  q10s16 = vaddq_s16(q2s16, q5s16);
-  q11s16 = vaddq_s16(q3s16, q4s16);
-  q12s16 = vsubq_s16(q3s16, q4s16);
-  q13s16 = vsubq_s16(q2s16, q5s16);
-  q14s16 = vsubq_s16(q1s16, q6s16);
-  q15s16 = vsubq_s16(q0s16, q7s16);
-
-  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-               &q15s16);
-
-  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-             &q15s16);
-
-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  d1 = d2 = dest;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-
-  q8s16 = q12s16;
-  q9s16 = q13s16;
-  q10s16 = q14s16;
-  q11s16 = q15s16;
-
-  d0u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d1u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d2u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-  d3u64 = vld1_u64((uint64_t *)d1);
-  d1 += dest_stride;
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-  d2 += dest_stride;
-  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-  d2 += dest_stride;
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm
deleted file mode 100644
index f3d5f246d..000000000
--- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm
+++ /dev/null
@@ -1,522 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_idct8x8_64_add_neon|
-    EXPORT  |aom_idct8x8_12_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
-    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
-    ; This macro will touch q0-q7 registers and use them as buffer during
-    ; calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14              ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14              ; >> 14
-    vqrshrn.s32     d23, q15, #14              ; >> 14
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14              ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct8x8_64_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    IDCT8x8_1D
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |aom_idct8x8_64_add_neon|
-
-;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride)
-
-|aom_idct8x8_12_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-
-    ; First transform rows
-    ; stage 1
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
-    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
-    ; to double the constants before multiplying to compensate this.
-    mov             r12, r3, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
-    mov             r12, r4, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_28_64)
-    vqrdmulh.s16    q4, q9, q0
-
-    mov             r12, r6, lsl #1
-    rsb             r12, #0
-    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_4_64)
-    vqrdmulh.s16    q7, q9, q1
-
-    mov             r12, r5, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
-
-    ; dct_const_round_shift(- input[3] * cospi_20_64)
-    vqrdmulh.s16    q5, q11, q0
-
-    mov             r12, r7, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
-
-    ; dct_const_round_shift(input[3] * cospi_12_64)
-    vqrdmulh.s16    q6, q11, q1
-
-    ; stage 2 & stage 3 - even half
-    mov             r12, r8, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrdmulh.s16    q9, q8, q0
-
-    mov             r12, r9, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_24_64)
-    vqrdmulh.s16    q13, q10, q1
-
-    ; dct_const_round_shift(input[1] * cospi_8_64)
-    vqrdmulh.s16    q15, q10, q0
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14              ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |aom_idct8x8_12_add_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
index 7d5f64004..69470eeb0 100644
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -11,8 +11,9 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 //------------------------------------------------------------------------------
@@ -342,8 +343,6 @@ void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
   vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
 }
 
-#if !HAVE_NEON_ASM
-
 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   int i;
@@ -529,4 +528,3 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
     }
   }
 }
-#endif  // !HAVE_NEON_ASM
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
deleted file mode 100644
index fba9c1b5b..000000000
--- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm
+++ /dev/null
@@ -1,287 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_v_predictor_4x4_neon|
-    EXPORT  |aom_v_predictor_8x8_neon|
-    EXPORT  |aom_v_predictor_16x16_neon|
-    EXPORT  |aom_v_predictor_32x32_neon|
-    EXPORT  |aom_h_predictor_4x4_neon|
-    EXPORT  |aom_h_predictor_8x8_neon|
-    EXPORT  |aom_h_predictor_16x16_neon|
-    EXPORT  |aom_h_predictor_32x32_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_4x4_neon| PROC
-    vld1.32             {d0[0]}, [r2]
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d0[0]}, [r0], r1
-    vst1.32             {d0[0]}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_v_predictor_4x4_neon|
-
-;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_8x8_neon| PROC
-    vld1.8              {d0}, [r2]
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    vst1.8              {d0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_v_predictor_8x8_neon|
-
-;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_16x16_neon| PROC
-    vld1.8              {q0}, [r2]
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    vst1.8              {q0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_v_predictor_16x16_neon|
-
-;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_v_predictor_32x32_neon| PROC
-    vld1.8              {q0, q1}, [r2]
-    mov                 r2, #2
-loop_v
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    vst1.8              {q0, q1}, [r0], r1
-    subs                r2, r2, #1
-    bgt                 loop_v
-    bx                  lr
-    ENDP                ; |aom_v_predictor_32x32_neon|
-
-;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_4x4_neon| PROC
-    vld1.32             {d1[0]}, [r3]
-    vdup.8              d0, d1[0]
-    vst1.32             {d0[0]}, [r0], r1
-    vdup.8              d0, d1[1]
-    vst1.32             {d0[0]}, [r0], r1
-    vdup.8              d0, d1[2]
-    vst1.32             {d0[0]}, [r0], r1
-    vdup.8              d0, d1[3]
-    vst1.32             {d0[0]}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_h_predictor_4x4_neon|
-
-;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                              const uint8_t *above,
-;                              const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_8x8_neon| PROC
-    vld1.64             {d1}, [r3]
-    vdup.8              d0, d1[0]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[1]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[2]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[3]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[4]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[5]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[6]
-    vst1.64             {d0}, [r0], r1
-    vdup.8              d0, d1[7]
-    vst1.64             {d0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_h_predictor_8x8_neon|
-
-;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_16x16_neon| PROC
-    vld1.8              {q1}, [r3]
-    vdup.8              q0, d2[0]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[1]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[2]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[3]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[4]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[5]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[6]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[7]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[0]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[1]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[2]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[3]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[4]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[5]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[6]
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[7]
-    vst1.8              {q0}, [r0], r1
-    bx                  lr
-    ENDP                ; |aom_h_predictor_16x16_neon|
-
-;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
-;                                const uint8_t *above,
-;                                const uint8_t *left)
-; r0  uint8_t *dst
-; r1  ptrdiff_t y_stride
-; r2  const uint8_t *above
-; r3  const uint8_t *left
-
-|aom_h_predictor_32x32_neon| PROC
-    sub                 r1, r1, #16
-    mov                 r2, #2
-loop_h
-    vld1.8              {q1}, [r3]!
-    vdup.8              q0, d2[0]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[1]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[2]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[3]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[4]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[5]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[6]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d2[7]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[0]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[1]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[2]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[3]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[4]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[5]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[6]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    vdup.8              q0, d3[7]
-    vst1.8              {q0}, [r0]!
-    vst1.8              {q0}, [r0], r1
-    subs                r2, r2, #1
-    bgt                 loop_h
-    bx                  lr
-    ENDP                ; |aom_h_predictor_32x32_neon|
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c
deleted file mode 100644
index c0562a6ea..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-
-static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
-                                       uint8x16_t qlimit,   // limit
-                                       uint8x16_t qthresh,  // thresh
-                                       uint8x16_t q3,       // p3
-                                       uint8x16_t q4,       // p2
-                                       uint8x16_t q5,       // p1
-                                       uint8x16_t q6,       // p0
-                                       uint8x16_t q7,       // q0
-                                       uint8x16_t q8,       // q1
-                                       uint8x16_t q9,       // q2
-                                       uint8x16_t q10,      // q3
-                                       uint8x16_t *q5r,     // p1
-                                       uint8x16_t *q6r,     // p0
-                                       uint8x16_t *q7r,     // q0
-                                       uint8x16_t *q8r) {   // q1
-  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-  int16x8_t q2s16, q11s16;
-  uint16x8_t q4u16;
-  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
-  int8x8_t d2s8, d3s8;
-
-  q11u8 = vabdq_u8(q3, q4);
-  q12u8 = vabdq_u8(q4, q5);
-  q13u8 = vabdq_u8(q5, q6);
-  q14u8 = vabdq_u8(q8, q7);
-  q3 = vabdq_u8(q9, q8);
-  q4 = vabdq_u8(q10, q9);
-
-  q11u8 = vmaxq_u8(q11u8, q12u8);
-  q12u8 = vmaxq_u8(q13u8, q14u8);
-  q3 = vmaxq_u8(q3, q4);
-  q15u8 = vmaxq_u8(q11u8, q12u8);
-
-  q9 = vabdq_u8(q6, q7);
-
-  // aom_hevmask
-  q13u8 = vcgtq_u8(q13u8, qthresh);
-  q14u8 = vcgtq_u8(q14u8, qthresh);
-  q15u8 = vmaxq_u8(q15u8, q3);
-
-  q2u8 = vabdq_u8(q5, q8);
-  q9 = vqaddq_u8(q9, q9);
-
-  q15u8 = vcgeq_u8(qlimit, q15u8);
-
-  // aom_filter() function
-  // convert to signed
-  q10 = vdupq_n_u8(0x80);
-  q8 = veorq_u8(q8, q10);
-  q7 = veorq_u8(q7, q10);
-  q6 = veorq_u8(q6, q10);
-  q5 = veorq_u8(q5, q10);
-
-  q2u8 = vshrq_n_u8(q2u8, 1);
-  q9 = vqaddq_u8(q9, q2u8);
-
-  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
-                   vget_low_s8(vreinterpretq_s8_u8(q6)));
-  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
-                    vget_high_s8(vreinterpretq_s8_u8(q6)));
-
-  q9 = vcgeq_u8(qblimit, q9);
-
-  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
-
-  q14u8 = vorrq_u8(q13u8, q14u8);
-
-  q4u16 = vdupq_n_u16(3);
-  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
-  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
-
-  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
-  q15u8 = vandq_u8(q15u8, q9);
-
-  q1s8 = vreinterpretq_s8_u8(q1u8);
-  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
-  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
-
-  q4 = vdupq_n_u8(3);
-  q9 = vdupq_n_u8(4);
-  // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0))
-  d2s8 = vqmovn_s16(q2s16);
-  d3s8 = vqmovn_s16(q11s16);
-  q1s8 = vcombine_s8(d2s8, d3s8);
-  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
-  q1s8 = vreinterpretq_s8_u8(q1u8);
-
-  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
-  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
-  q2s8 = vshrq_n_s8(q2s8, 3);
-  q1s8 = vshrq_n_s8(q1s8, 3);
-
-  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
-  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
-
-  q1s8 = vrshrq_n_s8(q1s8, 1);
-  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
-
-  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
-  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
-
-  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
-  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
-  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
-  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
-  return;
-}
-
-void aom_lpf_horizontal_4_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
-  uint8x16_t qblimit, qlimit, qthresh;
-  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-
-  dblimit0 = vld1_u8(blimit0);
-  dlimit0 = vld1_u8(limit0);
-  dthresh0 = vld1_u8(thresh0);
-  dblimit1 = vld1_u8(blimit1);
-  dlimit1 = vld1_u8(limit1);
-  dthresh1 = vld1_u8(thresh1);
-  qblimit = vcombine_u8(dblimit0, dblimit1);
-  qlimit = vcombine_u8(dlimit0, dlimit1);
-  qthresh = vcombine_u8(dthresh0, dthresh1);
-
-  s -= (p << 2);
-
-  q3u8 = vld1q_u8(s);
-  s += p;
-  q4u8 = vld1q_u8(s);
-  s += p;
-  q5u8 = vld1q_u8(s);
-  s += p;
-  q6u8 = vld1q_u8(s);
-  s += p;
-  q7u8 = vld1q_u8(s);
-  s += p;
-  q8u8 = vld1q_u8(s);
-  s += p;
-  q9u8 = vld1q_u8(s);
-  s += p;
-  q10u8 = vld1q_u8(s);
-
-  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
-                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
-
-  s -= (p * 5);
-  vst1q_u8(s, q5u8);
-  s += p;
-  vst1q_u8(s, q6u8);
-  s += p;
-  vst1q_u8(s, q7u8);
-  s += p;
-  vst1q_u8(s, q8u8);
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm
deleted file mode 100644
index b6e2c9edb..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm
+++ /dev/null
@@ -1,202 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_4_dual_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
-;                                    const uint8_t *blimit0,
-;                                    const uint8_t *limit0,
-;                                    const uint8_t *thresh0,
-;                                    const uint8_t *blimit1,
-;                                    const uint8_t *limit1,
-;                                    const uint8_t *thresh1)
-; r0    uint8_t *s,
-; r1    int p,
-; r2    const uint8_t *blimit0,
-; r3    const uint8_t *limit0,
-; sp    const uint8_t *thresh0,
-; sp+4  const uint8_t *blimit1,
-; sp+8  const uint8_t *limit1,
-; sp+12 const uint8_t *thresh1,
-
-|aom_lpf_horizontal_4_dual_neon| PROC
-    push        {lr}
-
-    ldr         r12, [sp, #4]              ; load thresh0
-    vld1.8      {d0}, [r2]                 ; load blimit0 to first half q
-    vld1.8      {d2}, [r3]                 ; load limit0 to first half q
-
-    add         r1, r1, r1                 ; double pitch
-    ldr         r2, [sp, #8]               ; load blimit1
-
-    vld1.8      {d4}, [r12]                ; load thresh0 to first half q
-
-    ldr         r3, [sp, #12]              ; load limit1
-    ldr         r12, [sp, #16]             ; load thresh1
-    vld1.8      {d1}, [r2]                 ; load blimit1 to 2nd half q
-
-    sub         r2, r0, r1, lsl #1         ; s[-4 * p]
-
-    vld1.8      {d3}, [r3]                 ; load limit1 to 2nd half q
-    vld1.8      {d5}, [r12]                ; load thresh1 to 2nd half q
-
-    vpush       {d8-d15}                   ; save neon registers
-
-    add         r3, r2, r1, lsr #1         ; s[-3 * p]
-
-    vld1.u8     {q3}, [r2@64], r1          ; p3
-    vld1.u8     {q4}, [r3@64], r1          ; p2
-    vld1.u8     {q5}, [r2@64], r1          ; p1
-    vld1.u8     {q6}, [r3@64], r1          ; p0
-    vld1.u8     {q7}, [r2@64], r1          ; q0
-    vld1.u8     {q8}, [r3@64], r1          ; q1
-    vld1.u8     {q9}, [r2@64]              ; q2
-    vld1.u8     {q10}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          aom_loop_filter_neon_16
-
-    vst1.u8     {q5}, [r2@64], r1          ; store op1
-    vst1.u8     {q6}, [r3@64], r1          ; store op0
-    vst1.u8     {q7}, [r2@64], r1          ; store oq0
-    vst1.u8     {q8}, [r3@64], r1          ; store oq1
-
-    vpop        {d8-d15}                   ; restore neon registers
-
-    pop         {pc}
-    ENDP        ; |aom_lpf_horizontal_4_dual_neon|
-
-; void aom_loop_filter_neon_16();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. This function uses
-; registers d8-d15, so the calling function must save those registers.
-;
-; r0-r3, r12 PRESERVE
-; q0    blimit
-; q1    limit
-; q2    thresh
-; q3    p3
-; q4    p2
-; q5    p1
-; q6    p0
-; q7    q0
-; q8    q1
-; q9    q2
-; q10   q3
-;
-; Outputs:
-; q5    op1
-; q6    op0
-; q7    oq0
-; q8    oq1
-|aom_loop_filter_neon_16| PROC
-
-    ; filter_mask
-    vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
-    vabd.u8     q12, q4, q5                 ; m2 = abs(p2 - p1)
-    vabd.u8     q13, q5, q6                 ; m3 = abs(p1 - p0)
-    vabd.u8     q14, q8, q7                 ; m4 = abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; m5 = abs(q2 - q1)
-    vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
-    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
-
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
-
-    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
-
-    vmov.u8     q10, #0x80
-
-    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
-
-    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
-
-    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
-    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-
-    veor        q7, q7, q10                 ; qs0
-
-    vcge.u8     q15, q1, q15                ; abs(m11) > limit
-
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    veor        q6, q6, q10                 ; ps0
-
-    veor        q5, q5, q10                 ; ps1
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-
-    veor        q8, q8, q10                 ; qs1
-
-    vmov.u16    q4, #3
-
-    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
-    vsubl.s8    q11, d15, d13
-
-    vcge.u8     q9, q0, q9                  ; a > blimit
-
-    vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
-    vorr        q14, q13, q14               ; hev
-
-    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
-    vmul.i16    q11, q11, q4
-
-    vand        q1, q1, q14                 ; filter &= hev
-    vand        q15, q15, q9                ; mask
-
-    vmov.u8     q4, #3
-
-    vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
-    vaddw.s8    q11, q11, d3
-
-    vmov.u8     q9, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d2, q2
-    vqmovn.s16  d3, q11
-    vand        q1, q1, q15                 ; filter &= mask
-
-    vqadd.s8    q2, q1, q4                  ; filter2 = clamp(filter+3)
-    vqadd.s8    q1, q1, q9                  ; filter1 = clamp(filter+4)
-    vshr.s8     q2, q2, #3                  ; filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; filter1 >>= 3
-
-
-    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + filter2)
-    vqsub.s8    q0, q7, q1                  ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
-
-    veor        q7, q0,  q10                ; *oq0 = u^0x80
-
-    vbic        q1, q1, q14                 ; filter &= ~hev
-
-    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
-    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
-
-    veor        q6, q11, q10                ; *op0 = u^0x80
-    veor        q5, q13, q10                ; *op1 = u^0x80
-    veor        q8, q12, q10                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |aom_loop_filter_neon_16|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c
deleted file mode 100644
index 2b1f80b81..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-
-static INLINE void loop_filter_neon(uint8x8_t dblimit,   // flimit
-                                    uint8x8_t dlimit,    // limit
-                                    uint8x8_t dthresh,   // thresh
-                                    uint8x8_t d3u8,      // p3
-                                    uint8x8_t d4u8,      // p2
-                                    uint8x8_t d5u8,      // p1
-                                    uint8x8_t d6u8,      // p0
-                                    uint8x8_t d7u8,      // q0
-                                    uint8x8_t d16u8,     // q1
-                                    uint8x8_t d17u8,     // q2
-                                    uint8x8_t d18u8,     // q3
-                                    uint8x8_t *d4ru8,    // p1
-                                    uint8x8_t *d5ru8,    // p0
-                                    uint8x8_t *d6ru8,    // q0
-                                    uint8x8_t *d7ru8) {  // q1
-  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
-  int16x8_t q12s16;
-  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
-  d19u8 = vabd_u8(d3u8, d4u8);
-  d20u8 = vabd_u8(d4u8, d5u8);
-  d21u8 = vabd_u8(d5u8, d6u8);
-  d22u8 = vabd_u8(d16u8, d7u8);
-  d3u8 = vabd_u8(d17u8, d16u8);
-  d4u8 = vabd_u8(d18u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-  d20u8 = vmax_u8(d21u8, d22u8);
-  d3u8 = vmax_u8(d3u8, d4u8);
-  d23u8 = vmax_u8(d19u8, d20u8);
-
-  d17u8 = vabd_u8(d6u8, d7u8);
-
-  d21u8 = vcgt_u8(d21u8, dthresh);
-  d22u8 = vcgt_u8(d22u8, dthresh);
-  d23u8 = vmax_u8(d23u8, d3u8);
-
-  d28u8 = vabd_u8(d5u8, d16u8);
-  d17u8 = vqadd_u8(d17u8, d17u8);
-
-  d23u8 = vcge_u8(dlimit, d23u8);
-
-  d18u8 = vdup_n_u8(0x80);
-  d5u8 = veor_u8(d5u8, d18u8);
-  d6u8 = veor_u8(d6u8, d18u8);
-  d7u8 = veor_u8(d7u8, d18u8);
-  d16u8 = veor_u8(d16u8, d18u8);
-
-  d28u8 = vshr_n_u8(d28u8, 1);
-  d17u8 = vqadd_u8(d17u8, d28u8);
-
-  d19u8 = vdup_n_u8(3);
-
-  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
-
-  d17u8 = vcge_u8(dblimit, d17u8);
-
-  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
-
-  d22u8 = vorr_u8(d21u8, d22u8);
-
-  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
-  d23u8 = vand_u8(d23u8, d17u8);
-
-  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
-  d17u8 = vdup_n_u8(4);
-
-  d27s8 = vqmovn_s16(q12s16);
-  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
-  d27s8 = vreinterpret_s8_u8(d27u8);
-
-  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
-  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
-  d28s8 = vshr_n_s8(d28s8, 3);
-  d27s8 = vshr_n_s8(d27s8, 3);
-
-  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
-  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
-  d27s8 = vrshr_n_s8(d27s8, 1);
-  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
-  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
-  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
-  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
-  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
-  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
-  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
-  return;
-}
-
-void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s, *psrc;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  psrc = src - (pitch << 2);
-  for (i = 0; i < 1; i++) {
-    s = psrc + i * 8;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
-    s -= (pitch * 5);
-    vst1_u8(s, d4u8);
-    s += pitch;
-    vst1_u8(s, d5u8);
-    s += pitch;
-    vst1_u8(s, d6u8);
-    s += pitch;
-    vst1_u8(s, d7u8);
-  }
-  return;
-}
-
-void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  int i, pitch8;
-  uint8_t *s;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-  uint8x8x4_t d4Result;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  pitch8 = pitch * 8;
-  for (i = 0; i < 1; i++, src += pitch8) {
-    s = src - (i + 1) * 4;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
-    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
-    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
-    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
-    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                      vreinterpret_u16_u32(d2tmp2.val[0]));
-    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                      vreinterpret_u16_u32(d2tmp3.val[0]));
-    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                      vreinterpret_u16_u32(d2tmp2.val[1]));
-    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                      vreinterpret_u16_u32(d2tmp3.val[1]));
-
-    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                     vreinterpret_u8_u16(d2tmp5.val[0]));
-    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                     vreinterpret_u8_u16(d2tmp5.val[1]));
-    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                      vreinterpret_u8_u16(d2tmp7.val[0]));
-    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                      vreinterpret_u8_u16(d2tmp7.val[1]));
-
-    d3u8 = d2tmp8.val[0];
-    d4u8 = d2tmp8.val[1];
-    d5u8 = d2tmp9.val[0];
-    d6u8 = d2tmp9.val[1];
-    d7u8 = d2tmp10.val[0];
-    d16u8 = d2tmp10.val[1];
-    d17u8 = d2tmp11.val[0];
-    d18u8 = d2tmp11.val[1];
-
-    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
-
-    d4Result.val[0] = d4u8;
-    d4Result.val[1] = d5u8;
-    d4Result.val[2] = d6u8;
-    d4Result.val[3] = d7u8;
-
-    src -= 2;
-    vst4_lane_u8(src, d4Result, 0);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 1);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 2);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 3);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 4);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 5);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 6);
-    src += pitch;
-    vst4_lane_u8(src, d4Result, 7);
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm
deleted file mode 100644
index 8b54984d5..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm
+++ /dev/null
@@ -1,252 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_4_neon|
-    EXPORT  |aom_lpf_vertical_4_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; Currently aom only works on iterations 8 at a time. The aom loop filter
-; works on 16 iterations at a time.
-;
-; void aom_lpf_horizontal_4_neon(uint8_t *s,
-;                                int p /* pitch */,
-;                                const uint8_t *blimit,
-;                                const uint8_t *limit,
-;                                const uint8_t *thresh)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_horizontal_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #4]               ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-    sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
-    add         r3, r2, r1, lsr #1         ; set to 3 lines down
-
-    vld1.u8     {d3}, [r2@64], r1          ; p3
-    vld1.u8     {d4}, [r3@64], r1          ; p2
-    vld1.u8     {d5}, [r2@64], r1          ; p1
-    vld1.u8     {d6}, [r3@64], r1          ; p0
-    vld1.u8     {d7}, [r2@64], r1          ; q0
-    vld1.u8     {d16}, [r3@64], r1         ; q1
-    vld1.u8     {d17}, [r2@64]             ; q2
-    vld1.u8     {d18}, [r3@64]             ; q3
-
-    sub         r2, r2, r1, lsl #1
-    sub         r3, r3, r1, lsl #1
-
-    bl          aom_loop_filter_neon
-
-    vst1.u8     {d4}, [r2@64], r1          ; store op1
-    vst1.u8     {d5}, [r3@64], r1          ; store op0
-    vst1.u8     {d6}, [r2@64], r1          ; store oq0
-    vst1.u8     {d7}, [r3@64], r1          ; store oq1
-
-    pop         {pc}
-    ENDP        ; |aom_lpf_horizontal_4_neon|
-
-; Currently aom only works on iterations 8 at a time. The aom loop filter
-; works on 16 iterations at a time.
-;
-; void aom_lpf_vertical_4_neon(uint8_t *s,
-;                              int p /* pitch */,
-;                              const uint8_t *blimit,
-;                              const uint8_t *limit,
-;                              const uint8_t *thresh)
-;
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_vertical_4_neon| PROC
-    push        {lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #4]              ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    bl          aom_loop_filter_neon
-
-    sub         r0, r0, #2
-
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
-    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
-    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
-    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
-    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
-    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
-    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
-    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
-    pop         {pc}
-    ENDP        ; |aom_lpf_vertical_4_neon|
-
-; void aom_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d4    op1
-; d5    op0
-; d6    oq0
-; d7    oq1
-|aom_loop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
-    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
-    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
-
-    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
-
-    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
-
-    vmov.u8     d18, #0x80
-
-    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
-
-    ; hevmask
-    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
-    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
-
-    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
-    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
-
-    veor        d7, d7, d18                 ; qs0
-
-    vcge.u8     d23, d1, d23                ; abs(m1) > limit
-
-    ; filter() function
-    ; convert to signed
-
-    vshr.u8     d28, d28, #1                ; a = a / 2
-    veor        d6, d6, d18                 ; ps0
-
-    veor        d5, d5, d18                 ; ps1
-    vqadd.u8    d17, d17, d28               ; a = b + a
-
-    veor        d16, d16, d18               ; qs1
-
-    vmov.u8     d19, #3
-
-    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
-
-    vcge.u8     d17, d0, d17                ; a > blimit
-
-    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
-    vorr        d22, d21, d22               ; hevmask
-
-    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
-
-    vand        d27, d27, d22               ; filter &= hev
-    vand        d23, d23, d17               ; filter_mask
-
-    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d17, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d27, q12
-
-    vand        d27, d27, d23               ; filter &= mask
-
-    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
-    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
-    vshr.s8     d28, d28, #3                ; filter2 >>= 3
-    vshr.s8     d27, d27, #3                ; filter1 >>= 3
-
-    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
-    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
-
-    ; outer tap adjustments
-    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
-
-    veor        d6, d26, d18                ; *oq0 = u^0x80
-
-    vbic        d27, d27, d22               ; filter &= ~hev
-
-    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
-    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
-
-    veor        d5, d19, d18                ; *op0 = u^0x80
-    veor        d4, d21, d18                ; *op1 = u^0x80
-    veor        d7, d20, d18                ; *oq1 = u^0x80
-
-    bx          lr
-    ENDP        ; |aom_loop_filter_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c
deleted file mode 100644
index c4502fdb5..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "./aom_dsp_rtcd.h"
-
-static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
-                                      uint8x8_t dlimit,    // limit
-                                      uint8x8_t dthresh,   // thresh
-                                      uint8x8_t d3u8,      // p2
-                                      uint8x8_t d4u8,      // p2
-                                      uint8x8_t d5u8,      // p1
-                                      uint8x8_t d6u8,      // p0
-                                      uint8x8_t d7u8,      // q0
-                                      uint8x8_t d16u8,     // q1
-                                      uint8x8_t d17u8,     // q2
-                                      uint8x8_t d18u8,     // q3
-                                      uint8x8_t *d0ru8,    // p1
-                                      uint8x8_t *d1ru8,    // p1
-                                      uint8x8_t *d2ru8,    // p0
-                                      uint8x8_t *d3ru8,    // q0
-                                      uint8x8_t *d4ru8,    // q1
-                                      uint8x8_t *d5ru8) {  // q1
-  uint32_t flat;
-  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
-  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-  int16x8_t q15s16;
-  uint16x8_t q10u16, q14u16;
-  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
-  d19u8 = vabd_u8(d3u8, d4u8);
-  d20u8 = vabd_u8(d4u8, d5u8);
-  d21u8 = vabd_u8(d5u8, d6u8);
-  d22u8 = vabd_u8(d16u8, d7u8);
-  d23u8 = vabd_u8(d17u8, d16u8);
-  d24u8 = vabd_u8(d18u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-  d20u8 = vmax_u8(d21u8, d22u8);
-
-  d25u8 = vabd_u8(d6u8, d4u8);
-
-  d23u8 = vmax_u8(d23u8, d24u8);
-
-  d26u8 = vabd_u8(d7u8, d17u8);
-
-  d19u8 = vmax_u8(d19u8, d20u8);
-
-  d24u8 = vabd_u8(d6u8, d7u8);
-  d27u8 = vabd_u8(d3u8, d6u8);
-  d28u8 = vabd_u8(d18u8, d7u8);
-
-  d19u8 = vmax_u8(d19u8, d23u8);
-
-  d23u8 = vabd_u8(d5u8, d16u8);
-  d24u8 = vqadd_u8(d24u8, d24u8);
-
-  d19u8 = vcge_u8(dlimit, d19u8);
-
-  d25u8 = vmax_u8(d25u8, d26u8);
-  d26u8 = vmax_u8(d27u8, d28u8);
-
-  d23u8 = vshr_n_u8(d23u8, 1);
-
-  d25u8 = vmax_u8(d25u8, d26u8);
-
-  d24u8 = vqadd_u8(d24u8, d23u8);
-
-  d20u8 = vmax_u8(d20u8, d25u8);
-
-  d23u8 = vdup_n_u8(1);
-  d24u8 = vcge_u8(dblimit, d24u8);
-
-  d21u8 = vcgt_u8(d21u8, dthresh);
-
-  d20u8 = vcge_u8(d23u8, d20u8);
-
-  d19u8 = vand_u8(d19u8, d24u8);
-
-  d23u8 = vcgt_u8(d22u8, dthresh);
-
-  d20u8 = vand_u8(d20u8, d19u8);
-
-  d22u8 = vdup_n_u8(0x80);
-
-  d23u8 = vorr_u8(d21u8, d23u8);
-
-  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
-
-  d30u8 = vshrn_n_u16(q10u16, 4);
-  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
-  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
-    d27u8 = vdup_n_u8(3);
-    d21u8 = vdup_n_u8(2);
-    q14u16 = vaddl_u8(d6u8, d7u8);
-    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-    *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vsubw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vsubw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
-  } else {
-    d21u8 = veor_u8(d7u8, d22u8);
-    d24u8 = veor_u8(d6u8, d22u8);
-    d25u8 = veor_u8(d5u8, d22u8);
-    d26u8 = veor_u8(d16u8, d22u8);
-
-    d27u8 = vdup_n_u8(3);
-
-    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
-    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
-    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
-    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-    q15s16 = vaddw_s8(q15s16, d29s8);
-
-    d29u8 = vdup_n_u8(4);
-
-    d28s8 = vqmovn_s16(q15s16);
-
-    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
-    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
-    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
-    d30s8 = vshr_n_s8(d30s8, 3);
-    d29s8 = vshr_n_s8(d29s8, 3);
-
-    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
-    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
-    d29s8 = vrshr_n_s8(d29s8, 1);
-    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
-    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
-    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
-    if (flat == 0) {  // filter_branch_only
-      *d0ru8 = d4u8;
-      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-      *d5ru8 = d17u8;
-      return;
-    }
-
-    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
-    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
-    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
-    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
-    d23u8 = vdup_n_u8(2);
-    q14u16 = vaddl_u8(d6u8, d7u8);
-    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
-    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
-    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
-    q14u16 = vaddw_u8(q14u16, d5u8);
-
-    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
-    d30u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vaddw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-
-    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
-    d31u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vaddw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-
-    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
-    d23u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d3u8);
-    q14u16 = vsubw_u8(q14u16, d6u8);
-    q14u16 = vaddw_u8(q14u16, d7u8);
-
-    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
-    d22u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d4u8);
-    q14u16 = vsubw_u8(q14u16, d7u8);
-    q14u16 = vaddw_u8(q14u16, d16u8);
-
-    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
-    d6u8 = vqrshrn_n_u16(q14u16, 3);
-
-    q14u16 = vsubw_u8(q14u16, d5u8);
-    q14u16 = vsubw_u8(q14u16, d16u8);
-    q14u16 = vaddw_u8(q14u16, d17u8);
-    q14u16 = vaddw_u8(q14u16, d18u8);
-
-    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
-    d7u8 = vqrshrn_n_u16(q14u16, 3);
-
-    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
-    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
-    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
-  }
-  return;
-}
-
-void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s, *psrc;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  uint8x8_t d16u8, d17u8, d18u8;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  psrc = src - (pitch << 2);
-  for (i = 0; i < 1; i++) {
-    s = psrc + i * 8;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
-                       &d5u8);
-
-    s -= (pitch * 6);
-    vst1_u8(s, d0u8);
-    s += pitch;
-    vst1_u8(s, d1u8);
-    s += pitch;
-    vst1_u8(s, d2u8);
-    s += pitch;
-    vst1_u8(s, d3u8);
-    s += pitch;
-    vst1_u8(s, d4u8);
-    s += pitch;
-    vst1_u8(s, d5u8);
-  }
-  return;
-}
-
-void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  uint8_t *s;
-  uint8x8_t dblimit, dlimit, dthresh;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  uint8x8_t d16u8, d17u8, d18u8;
-  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
-  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
-  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
-  uint8x8x4_t d4Result;
-  uint8x8x2_t d2Result;
-
-  dblimit = vld1_u8(blimit);
-  dlimit = vld1_u8(limit);
-  dthresh = vld1_u8(thresh);
-
-  for (i = 0; i < 1; i++) {
-    s = src + (i * (pitch << 3)) - 4;
-
-    d3u8 = vld1_u8(s);
-    s += pitch;
-    d4u8 = vld1_u8(s);
-    s += pitch;
-    d5u8 = vld1_u8(s);
-    s += pitch;
-    d6u8 = vld1_u8(s);
-    s += pitch;
-    d7u8 = vld1_u8(s);
-    s += pitch;
-    d16u8 = vld1_u8(s);
-    s += pitch;
-    d17u8 = vld1_u8(s);
-    s += pitch;
-    d18u8 = vld1_u8(s);
-
-    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
-    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
-    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
-    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
-
-    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
-                      vreinterpret_u16_u32(d2tmp2.val[0]));
-    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
-                      vreinterpret_u16_u32(d2tmp3.val[0]));
-    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
-                      vreinterpret_u16_u32(d2tmp2.val[1]));
-    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
-                      vreinterpret_u16_u32(d2tmp3.val[1]));
-
-    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
-                     vreinterpret_u8_u16(d2tmp5.val[0]));
-    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
-                     vreinterpret_u8_u16(d2tmp5.val[1]));
-    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
-                      vreinterpret_u8_u16(d2tmp7.val[0]));
-    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
-                      vreinterpret_u8_u16(d2tmp7.val[1]));
-
-    d3u8 = d2tmp8.val[0];
-    d4u8 = d2tmp8.val[1];
-    d5u8 = d2tmp9.val[0];
-    d6u8 = d2tmp9.val[1];
-    d7u8 = d2tmp10.val[0];
-    d16u8 = d2tmp10.val[1];
-    d17u8 = d2tmp11.val[0];
-    d18u8 = d2tmp11.val[1];
-
-    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
-                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
-                       &d5u8);
-
-    d4Result.val[0] = d0u8;
-    d4Result.val[1] = d1u8;
-    d4Result.val[2] = d2u8;
-    d4Result.val[3] = d3u8;
-
-    d2Result.val[0] = d4u8;
-    d2Result.val[1] = d5u8;
-
-    s = src - 3;
-    vst4_lane_u8(s, d4Result, 0);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 1);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 2);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 3);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 4);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 5);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 6);
-    s += pitch;
-    vst4_lane_u8(s, d4Result, 7);
-
-    s = src + 1;
-    vst2_lane_u8(s, d2Result, 0);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 1);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 2);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 3);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 4);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 5);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 6);
-    s += pitch;
-    vst2_lane_u8(s, d2Result, 7);
-  }
-  return;
-}
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm
deleted file mode 100644
index 9f3db66ee..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm
+++ /dev/null
@@ -1,428 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_8_neon|
-    EXPORT  |aom_lpf_vertical_8_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; Currently aom only works on iterations 8 at a time. The aom loop filter
-; works on 16 iterations at a time.
-;
-; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
-;                                const uint8_t *blimit,
-;                                const uint8_t *limit,
-;                                const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_horizontal_8_neon| PROC
-    push        {r4-r5, lr}
-
-    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r2, [sp, #12]              ; load thresh
-    add         r1, r1, r1                 ; double pitch
-
-    vld1.8      {d1[]}, [r3]               ; duplicate *limit
-    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
-
-    sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines
-    add         r2, r3, r1, lsr #1         ; set to 3 lines down
-
-    vld1.u8     {d3}, [r3@64], r1          ; p3
-    vld1.u8     {d4}, [r2@64], r1          ; p2
-    vld1.u8     {d5}, [r3@64], r1          ; p1
-    vld1.u8     {d6}, [r2@64], r1          ; p0
-    vld1.u8     {d7}, [r3@64], r1          ; q0
-    vld1.u8     {d16}, [r2@64], r1         ; q1
-    vld1.u8     {d17}, [r3@64]             ; q2
-    vld1.u8     {d18}, [r2@64], r1         ; q3
-
-    sub         r3, r3, r1, lsl #1
-    sub         r2, r2, r1, lsl #2
-
-    bl          aom_mbloop_filter_neon
-
-    vst1.u8     {d0}, [r2@64], r1          ; store op2
-    vst1.u8     {d1}, [r3@64], r1          ; store op1
-    vst1.u8     {d2}, [r2@64], r1          ; store op0
-    vst1.u8     {d3}, [r3@64], r1          ; store oq0
-    vst1.u8     {d4}, [r2@64], r1          ; store oq1
-    vst1.u8     {d5}, [r3@64], r1          ; store oq2
-
-    pop         {r4-r5, pc}
-
-    ENDP        ; |aom_lpf_horizontal_8_neon|
-
-; void aom_lpf_vertical_8_neon(uint8_t *s,
-;                              int pitch,
-;                              const uint8_t *blimit,
-;                              const uint8_t *limit,
-;                              const uint8_t *thresh)
-;
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_vertical_8_neon| PROC
-    push        {r4-r5, lr}
-
-    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    vld1.8      {d1[]}, [r3]              ; duplicate *limit
-
-    ldr         r3, [sp, #12]             ; load thresh
-    sub         r2, r0, #4                ; move s pointer down by 4 columns
-
-    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
-
-    vld1.u8     {d3}, [r2], r1             ; load s data
-    vld1.u8     {d4}, [r2], r1
-    vld1.u8     {d5}, [r2], r1
-    vld1.u8     {d6}, [r2], r1
-    vld1.u8     {d7}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d18}, [r2]
-
-    ;transpose to 8x16 matrix
-    vtrn.32     d3, d7
-    vtrn.32     d4, d16
-    vtrn.32     d5, d17
-    vtrn.32     d6, d18
-
-    vtrn.16     d3, d5
-    vtrn.16     d4, d6
-    vtrn.16     d7, d17
-    vtrn.16     d16, d18
-
-    vtrn.8      d3, d4
-    vtrn.8      d5, d6
-    vtrn.8      d7, d16
-    vtrn.8      d17, d18
-
-    sub         r2, r0, #3
-    add         r3, r0, #1
-
-    bl          aom_mbloop_filter_neon
-
-    ;store op2, op1, op0, oq0
-    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
-    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
-    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
-    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
-    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
-    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
-    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
-    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
-
-    ;store oq1, oq2
-    vst2.8      {d4[0], d5[0]}, [r3], r1
-    vst2.8      {d4[1], d5[1]}, [r3], r1
-    vst2.8      {d4[2], d5[2]}, [r3], r1
-    vst2.8      {d4[3], d5[3]}, [r3], r1
-    vst2.8      {d4[4], d5[4]}, [r3], r1
-    vst2.8      {d4[5], d5[5]}, [r3], r1
-    vst2.8      {d4[6], d5[6]}, [r3], r1
-    vst2.8      {d4[7], d5[7]}, [r3]
-
-    pop         {r4-r5, pc}
-    ENDP        ; |aom_lpf_vertical_8_neon|
-
-; void aom_mbloop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0    blimit
-; d1    limit
-; d2    thresh
-; d3    p3
-; d4    p2
-; d5    p1
-; d6    p0
-; d7    q0
-; d16   q1
-; d17   q2
-; d18   q3
-;
-; Outputs:
-; d0    op2
-; d1    op1
-; d2    op0
-; d3    oq0
-; d4    oq1
-; d5    oq2
-|aom_mbloop_filter_neon| PROC
-    ; filter_mask
-    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
-    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
-    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
-    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
-    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
-    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
-    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
-
-    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
-
-    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
-
-    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
-
-    vmax.u8     d19, d19, d20
-
-    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
-    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
-    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
-
-    vmax.u8     d19, d19, d23
-
-    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
-    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
-
-    ; abs () > limit
-    vcge.u8     d19, d1, d19
-
-    ; only compare the largest value to thresh
-    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
-    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
-
-    vshr.u8     d23, d23, #1               ; a = a / 2
-
-    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
-
-    vqadd.u8    d24, d24, d23              ; a = b + a
-
-    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
-
-    vmov.u8     d23, #1
-    vcge.u8     d24, d0, d24               ; a > blimit
-
-    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
-
-    vcge.u8     d20, d23, d20              ; flat
-
-    vand        d19, d19, d24              ; mask
-
-    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
-
-    vand        d20, d20, d19              ; flat & mask
-
-    vmov.u8     d22, #0x80
-
-    vorr        d23, d21, d23              ; hev
-
-    ; This instruction will truncate the "flat & mask" masks down to 4 bits
-    ; each to fit into one 32 bit arm register. The values are stored in
-    ; q10.64[0].
-    vshrn.u16   d30, q10, #4
-    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
-
-    adds        r5, r4, #1                 ; Check for all 1's
-
-    ; If mask and flat are 1's for all vectors, then we only need to execute
-    ; the power branch for all vectors.
-    beq         power_branch_only
-
-    cmp         r4, #0                     ; Check for 0, set flag for later
-
-    ; mbfilter() function
-    ; filter() function
-    ; convert to signed
-    veor        d21, d7, d22               ; qs0
-    veor        d24, d6, d22               ; ps0
-    veor        d25, d5, d22               ; ps1
-    veor        d26, d16, d22              ; qs1
-
-    vmov.u8     d27, #3
-
-    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
-
-    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-
-    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-
-    vand        d29, d29, d23              ; filter &= hev
-
-    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-
-    vmov.u8     d29, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d28, q15
-
-    vand        d28, d28, d19              ; filter &= mask
-
-    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
-    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
-    vshr.s8     d30, d30, #3               ; filter2 >>= 3
-    vshr.s8     d29, d29, #3               ; filter1 >>= 3
-
-    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
-
-    ; outer tap adjustments: ++filter1 >> 1
-    vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d23              ; filter &= ~hev
-
-    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
-    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
-
-    ; If mask and flat are 0's for all vectors, then we only need to execute
-    ; the filter branch for all vectors.
-    beq         filter_branch_only
-
-    ; If mask and flat are mixed then we must perform both branches and
-    ; combine the data.
-    veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d21, d21, d22              ; *f_oq0 = u^0x80
-    veor        d25, d25, d22              ; *f_op1 = u^0x80
-    veor        d26, d26, d22              ; *f_oq1 = u^0x80
-
-    ; At this point we have already executed the filter branch. The filter
-    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
-    ; branch and combine the data.
-    vmov.u8     d23, #2
-    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
-    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
-
-    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
-
-    vaddw.u8    q14, d5                    ; r_op2 += p1
-
-    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
-
-    vqrshrn.u16 d30, q14, #3               ; r_op2
-
-    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
-    vsubw.u8    q14, d4                    ; r_op1 -= p2
-    vaddw.u8    q14, d5                    ; r_op1 += p1
-    vaddw.u8    q14, d16                   ; r_op1 += q1
-
-    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
-
-    vqrshrn.u16 d31, q14, #3               ; r_op1
-
-    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
-    vsubw.u8    q14, d5                    ; r_op0 -= p1
-    vaddw.u8    q14, d6                    ; r_op0 += p0
-    vaddw.u8    q14, d17                   ; r_op0 += q2
-
-    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
-
-    vqrshrn.u16 d23, q14, #3               ; r_op0
-
-    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
-    vsubw.u8    q14, d6                    ; r_oq0 -= p0
-    vaddw.u8    q14, d7                    ; r_oq0 += q0
-
-    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
-
-    vaddw.u8    q14, d18                   ; oq0 += q3
-
-    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
-
-    vqrshrn.u16 d22, q14, #3               ; r_oq0
-
-    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
-    vsubw.u8    q14, d7                    ; r_oq1 -= q0
-    vaddw.u8    q14, d16                   ; r_oq1 += q1
-
-    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
-
-    vaddw.u8    q14, d18                   ; r_oq1 += q3
-
-    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
-
-    vqrshrn.u16 d6, q14, #3                ; r_oq1
-
-    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
-    vsubw.u8    q14, d16                   ; r_oq2 -= q1
-    vaddw.u8    q14, d17                   ; r_oq2 += q2
-    vaddw.u8    q14, d18                   ; r_oq2 += q3
-
-    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
-
-    vqrshrn.u16 d7, q14, #3                ; r_oq2
-
-    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
-    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
-    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
-
-    bx          lr
-
-power_branch_only
-    vmov.u8     d27, #3
-    vmov.u8     d21, #2
-    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
-    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
-    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
-    vaddw.u8    q14, d5                    ; op2 += p1
-    vqrshrn.u16 d0, q14, #3                ; op2
-
-    vsubw.u8    q14, d3                    ; op1 = op2 - p3
-    vsubw.u8    q14, d4                    ; op1 -= p2
-    vaddw.u8    q14, d5                    ; op1 += p1
-    vaddw.u8    q14, d16                   ; op1 += q1
-    vqrshrn.u16 d1, q14, #3                ; op1
-
-    vsubw.u8    q14, d3                    ; op0 = op1 - p3
-    vsubw.u8    q14, d5                    ; op0 -= p1
-    vaddw.u8    q14, d6                    ; op0 += p0
-    vaddw.u8    q14, d17                   ; op0 += q2
-    vqrshrn.u16 d2, q14, #3                ; op0
-
-    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
-    vsubw.u8    q14, d6                    ; oq0 -= p0
-    vaddw.u8    q14, d7                    ; oq0 += q0
-    vaddw.u8    q14, d18                   ; oq0 += q3
-    vqrshrn.u16 d3, q14, #3                ; oq0
-
-    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
-    vsubw.u8    q14, d7                    ; oq1 -= q0
-    vaddw.u8    q14, d16                   ; oq1 += q1
-    vaddw.u8    q14, d18                   ; oq1 += q3
-    vqrshrn.u16 d4, q14, #3                ; oq1
-
-    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
-    vsubw.u8    q14, d16                   ; oq2 -= q1
-    vaddw.u8    q14, d17                   ; oq2 += q2
-    vaddw.u8    q14, d18                   ; oq2 += q3
-    vqrshrn.u16 d5, q14, #3                ; oq2
-
-    bx          lr
-
-filter_branch_only
-    ; TODO(fgalligan): See if we can rearange registers so we do not need to
-    ; do the 2 vswp.
-    vswp        d0, d4                      ; op2
-    vswp        d5, d17                     ; oq2
-    veor        d2, d24, d22                ; *op0 = u^0x80
-    veor        d3, d21, d22                ; *oq0 = u^0x80
-    veor        d1, d25, d22                ; *op1 = u^0x80
-    veor        d4, d26, d22                ; *oq1 = u^0x80
-
-    bx          lr
-
-    ENDP        ; |aom_mbloop_filter_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm
deleted file mode 100644
index 675928860..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm
+++ /dev/null
@@ -1,638 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-    EXPORT  |aom_lpf_horizontal_edge_8_neon|
-    EXPORT  |aom_lpf_horizontal_edge_16_neon|
-    EXPORT  |aom_lpf_vertical_16_neon|
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; void mb_lpf_horizontal_edge(uint8_t *s, int p,
-;                             const uint8_t *blimit,
-;                             const uint8_t *limit,
-;                             const uint8_t *thresh,
-;                             int count)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-; r12   int count
-|mb_lpf_horizontal_edge| PROC
-    push        {r4-r8, lr}
-    vpush       {d8-d15}
-    ldr         r4, [sp, #88]              ; load thresh
-
-h_count
-    vld1.8      {d16[]}, [r2]              ; load *blimit
-    vld1.8      {d17[]}, [r3]              ; load *limit
-    vld1.8      {d18[]}, [r4]              ; load *thresh
-
-    sub         r8, r0, r1, lsl #3         ; move src pointer down by 8 lines
-
-    vld1.u8     {d0}, [r8@64], r1          ; p7
-    vld1.u8     {d1}, [r8@64], r1          ; p6
-    vld1.u8     {d2}, [r8@64], r1          ; p5
-    vld1.u8     {d3}, [r8@64], r1          ; p4
-    vld1.u8     {d4}, [r8@64], r1          ; p3
-    vld1.u8     {d5}, [r8@64], r1          ; p2
-    vld1.u8     {d6}, [r8@64], r1          ; p1
-    vld1.u8     {d7}, [r8@64], r1          ; p0
-    vld1.u8     {d8}, [r8@64], r1          ; q0
-    vld1.u8     {d9}, [r8@64], r1          ; q1
-    vld1.u8     {d10}, [r8@64], r1         ; q2
-    vld1.u8     {d11}, [r8@64], r1         ; q3
-    vld1.u8     {d12}, [r8@64], r1         ; q4
-    vld1.u8     {d13}, [r8@64], r1         ; q5
-    vld1.u8     {d14}, [r8@64], r1         ; q6
-    vld1.u8     {d15}, [r8@64], r1         ; q7
-
-    bl          aom_wide_mbfilter_neon
-
-    tst         r7, #1
-    beq         h_mbfilter
-
-    ; flat && mask were not set for any of the channels. Just store the values
-    ; from filter.
-    sub         r8, r0, r1, lsl #1
-
-    vst1.u8     {d25}, [r8@64], r1         ; store op1
-    vst1.u8     {d24}, [r8@64], r1         ; store op0
-    vst1.u8     {d23}, [r8@64], r1         ; store oq0
-    vst1.u8     {d26}, [r8@64], r1         ; store oq1
-
-    b           h_next
-
-h_mbfilter
-    tst         r7, #2
-    beq         h_wide_mbfilter
-
-    ; flat2 was not set for any of the channels. Just store the values from
-    ; mbfilter.
-    sub         r8, r0, r1, lsl #1
-    sub         r8, r8, r1
-
-    vst1.u8     {d18}, [r8@64], r1         ; store op2
-    vst1.u8     {d19}, [r8@64], r1         ; store op1
-    vst1.u8     {d20}, [r8@64], r1         ; store op0
-    vst1.u8     {d21}, [r8@64], r1         ; store oq0
-    vst1.u8     {d22}, [r8@64], r1         ; store oq1
-    vst1.u8     {d23}, [r8@64], r1         ; store oq2
-
-    b           h_next
-
-h_wide_mbfilter
-    sub         r8, r0, r1, lsl #3
-    add         r8, r8, r1
-
-    vst1.u8     {d16}, [r8@64], r1         ; store op6
-    vst1.u8     {d24}, [r8@64], r1         ; store op5
-    vst1.u8     {d25}, [r8@64], r1         ; store op4
-    vst1.u8     {d26}, [r8@64], r1         ; store op3
-    vst1.u8     {d27}, [r8@64], r1         ; store op2
-    vst1.u8     {d18}, [r8@64], r1         ; store op1
-    vst1.u8     {d19}, [r8@64], r1         ; store op0
-    vst1.u8     {d20}, [r8@64], r1         ; store oq0
-    vst1.u8     {d21}, [r8@64], r1         ; store oq1
-    vst1.u8     {d22}, [r8@64], r1         ; store oq2
-    vst1.u8     {d23}, [r8@64], r1         ; store oq3
-    vst1.u8     {d1}, [r8@64], r1          ; store oq4
-    vst1.u8     {d2}, [r8@64], r1          ; store oq5
-    vst1.u8     {d3}, [r8@64], r1          ; store oq6
-
-h_next
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         h_count
-
-    vpop        {d8-d15}
-    pop         {r4-r8, pc}
-
-    ENDP        ; |mb_lpf_horizontal_edge|
-
-; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
-;                                     const uint8_t *blimit,
-;                                     const uint8_t *limit,
-;                                     const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh
-|aom_lpf_horizontal_edge_8_neon| PROC
-    mov r12, #1
-    b mb_lpf_horizontal_edge
-    ENDP        ; |aom_lpf_horizontal_edge_8_neon|
-
-; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
-;                                      const uint8_t *blimit,
-;                                      const uint8_t *limit,
-;                                      const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int pitch,
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh
-|aom_lpf_horizontal_edge_16_neon| PROC
-    mov r12, #2
-    b mb_lpf_horizontal_edge
-    ENDP        ; |aom_lpf_horizontal_edge_16_neon|
-
-; void aom_lpf_vertical_16_neon(uint8_t *s, int p,
-;                               const uint8_t *blimit,
-;                               const uint8_t *limit,
-;                               const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh,
-|aom_lpf_vertical_16_neon| PROC
-    push        {r4-r8, lr}
-    vpush       {d8-d15}
-    ldr         r4, [sp, #88]              ; load thresh
-
-    vld1.8      {d16[]}, [r2]              ; load *blimit
-    vld1.8      {d17[]}, [r3]              ; load *limit
-    vld1.8      {d18[]}, [r4]              ; load *thresh
-
-    sub         r8, r0, #8
-
-    vld1.8      {d0}, [r8@64], r1
-    vld1.8      {d8}, [r0@64], r1
-    vld1.8      {d1}, [r8@64], r1
-    vld1.8      {d9}, [r0@64], r1
-    vld1.8      {d2}, [r8@64], r1
-    vld1.8      {d10}, [r0@64], r1
-    vld1.8      {d3}, [r8@64], r1
-    vld1.8      {d11}, [r0@64], r1
-    vld1.8      {d4}, [r8@64], r1
-    vld1.8      {d12}, [r0@64], r1
-    vld1.8      {d5}, [r8@64], r1
-    vld1.8      {d13}, [r0@64], r1
-    vld1.8      {d6}, [r8@64], r1
-    vld1.8      {d14}, [r0@64], r1
-    vld1.8      {d7}, [r8@64], r1
-    vld1.8      {d15}, [r0@64], r1
-
-    sub         r0, r0, r1, lsl #3
-
-    vtrn.32     q0, q2
-    vtrn.32     q1, q3
-    vtrn.32     q4, q6
-    vtrn.32     q5, q7
-
-    vtrn.16     q0, q1
-    vtrn.16     q2, q3
-    vtrn.16     q4, q5
-    vtrn.16     q6, q7
-
-    vtrn.8      d0, d1
-    vtrn.8      d2, d3
-    vtrn.8      d4, d5
-    vtrn.8      d6, d7
-
-    vtrn.8      d8, d9
-    vtrn.8      d10, d11
-    vtrn.8      d12, d13
-    vtrn.8      d14, d15
-
-    bl          aom_wide_mbfilter_neon
-
-    tst         r7, #1
-    beq         v_mbfilter
-
-    ; flat && mask were not set for any of the channels. Just store the values
-    ; from filter.
-    sub         r8, r0, #2
-
-    vswp        d23, d25
-
-    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
-    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
-    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
-    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
-    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
-    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
-    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
-    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
-
-    b           v_end
-
-v_mbfilter
-    tst         r7, #2
-    beq         v_wide_mbfilter
-
-    ; flat2 was not set for any of the channels. Just store the values from
-    ; mbfilter.
-    sub         r8, r0, #3
-
-    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1
-    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1
-    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
-    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
-    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
-    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
-    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
-    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
-    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
-    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
-    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
-    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
-    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
-    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
-    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
-    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
-
-    b           v_end
-
-v_wide_mbfilter
-    sub         r8, r0, #8
-
-    vtrn.32     d0,  d26
-    vtrn.32     d16, d27
-    vtrn.32     d24, d18
-    vtrn.32     d25, d19
-
-    vtrn.16     d0,  d24
-    vtrn.16     d16, d25
-    vtrn.16     d26, d18
-    vtrn.16     d27, d19
-
-    vtrn.8      d0,  d16
-    vtrn.8      d24, d25
-    vtrn.8      d26, d27
-    vtrn.8      d18, d19
-
-    vtrn.32     d20, d1
-    vtrn.32     d21, d2
-    vtrn.32     d22, d3
-    vtrn.32     d23, d15
-
-    vtrn.16     d20, d22
-    vtrn.16     d21, d23
-    vtrn.16     d1,  d3
-    vtrn.16     d2,  d15
-
-    vtrn.8      d20, d21
-    vtrn.8      d22, d23
-    vtrn.8      d1,  d2
-    vtrn.8      d3,  d15
-
-    vst1.8      {d0}, [r8@64], r1
-    vst1.8      {d20}, [r0@64], r1
-    vst1.8      {d16}, [r8@64], r1
-    vst1.8      {d21}, [r0@64], r1
-    vst1.8      {d24}, [r8@64], r1
-    vst1.8      {d22}, [r0@64], r1
-    vst1.8      {d25}, [r8@64], r1
-    vst1.8      {d23}, [r0@64], r1
-    vst1.8      {d26}, [r8@64], r1
-    vst1.8      {d1}, [r0@64], r1
-    vst1.8      {d27}, [r8@64], r1
-    vst1.8      {d2}, [r0@64], r1
-    vst1.8      {d18}, [r8@64], r1
-    vst1.8      {d3}, [r0@64], r1
-    vst1.8      {d19}, [r8@64], r1
-    vst1.8      {d15}, [r0@64], r1
-
-v_end
-    vpop        {d8-d15}
-    pop         {r4-r8, pc}
-
-    ENDP        ; |aom_lpf_vertical_16_neon|
-
-; void aom_wide_mbfilter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store.
-;
-; r0-r3 PRESERVE
-; d16    blimit
-; d17    limit
-; d18    thresh
-; d0    p7
-; d1    p6
-; d2    p5
-; d3    p4
-; d4    p3
-; d5    p2
-; d6    p1
-; d7    p0
-; d8    q0
-; d9    q1
-; d10   q2
-; d11   q3
-; d12   q4
-; d13   q5
-; d14   q6
-; d15   q7
-|aom_wide_mbfilter_neon| PROC
-    mov         r7, #0
-
-    ; filter_mask
-    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
-    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
-    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
-    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
-    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
-    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
-
-    ; only compare the largest value to limit
-    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
-    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
-    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
-    vmax.u8     d19, d19, d20
-
-    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
-
-    vmax.u8     d19, d19, d23
-
-    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
-    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
-
-    ; abs () > limit
-    vcge.u8     d19, d17, d19
-
-    ; flatmask4
-    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
-    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
-    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
-    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
-
-    ; only compare the largest value to thresh
-    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
-    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
-    vmax.u8     d25, d25, d26
-    vmax.u8     d20, d20, d25
-
-    vshr.u8     d23, d23, #1               ; a = a / 2
-    vqadd.u8    d24, d24, d23              ; a = b + a
-
-    vmov.u8     d30, #1
-    vcge.u8     d24, d16, d24              ; (a > blimit * 2 + limit) * -1
-
-    vcge.u8     d20, d30, d20              ; flat
-
-    vand        d19, d19, d24              ; mask
-
-    ; hevmask
-    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
-    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
-    vorr        d21, d21, d22              ; hev
-
-    vand        d16, d20, d19              ; flat && mask
-    vmov        r5, r6, d16
-
-    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
-    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
-    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
-    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
-    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
-    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
-    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
-    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
-    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
-
-    ; only compare the largest value to thresh
-    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
-    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
-    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
-    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
-
-    vmax.u8     d26, d22, d23
-    vmax.u8     d27, d24, d25
-    vmax.u8     d23, d26, d27
-
-    vcge.u8     d18, d30, d23              ; flat2
-
-    vmov.u8     d22, #0x80
-
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #1                 ; Only do filter branch
-
-    vand        d17, d18, d16              ; flat2 && flat && mask
-    vmov        r5, r6, d17
-
-    ; mbfilter() function
-
-    ; filter() function
-    ; convert to signed
-    veor        d23, d8, d22               ; qs0
-    veor        d24, d7, d22               ; ps0
-    veor        d25, d6, d22               ; ps1
-    veor        d26, d9, d22               ; qs1
-
-    vmov.u8     d27, #3
-
-    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
-    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
-    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
-    vand        d29, d29, d21              ; filter &= hev
-    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
-    vmov.u8     d29, #4
-
-    ; filter = clamp(filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d28, q15
-
-    vand        d28, d28, d19              ; filter &= mask
-
-    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
-    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
-    vshr.s8     d30, d30, #3               ; filter2 >>= 3
-    vshr.s8     d29, d29, #3               ; filter1 >>= 3
-
-
-    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
-    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
-
-    ; outer tap adjustments: ++filter1 >> 1
-    vrshr.s8    d29, d29, #1
-    vbic        d29, d29, d21              ; filter &= ~hev
-
-    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
-    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
-
-    veor        d24, d24, d22              ; *f_op0 = u^0x80
-    veor        d23, d23, d22              ; *f_oq0 = u^0x80
-    veor        d25, d25, d22              ; *f_op1 = u^0x80
-    veor        d26, d26, d22              ; *f_oq1 = u^0x80
-
-    tst         r7, #1
-    bxne        lr
-
-    orrs        r5, r5, r6                 ; Check for 0
-    orreq       r7, r7, #2                 ; Only do mbfilter branch
-
-    ; mbfilter flat && mask branch
-    ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
-    ; and using vibt on the q's?
-    vmov.u8     d29, #2
-    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
-    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
-    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
-    vaddl.u8    q10, d4, d5
-    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
-    vaddl.u8    q14, d6, d9
-    vqrshrn.u16 d18, q15, #3               ; r_op2
-
-    vsub.i16    q15, q10
-    vaddl.u8    q10, d4, d6
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d7, d10
-    vqrshrn.u16 d19, q15, #3               ; r_op1
-
-    vsub.i16    q15, q10
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d8, d11
-    vqrshrn.u16 d20, q15, #3               ; r_op0
-
-    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
-    vsubw.u8    q15, d7                    ; oq0 -= p0
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d9, d11
-    vqrshrn.u16 d21, q15, #3               ; r_oq0
-
-    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
-    vsubw.u8    q15, d8                    ; oq1 -= q0
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d10, d11
-    vqrshrn.u16 d22, q15, #3               ; r_oq1
-
-    vsubw.u8    q15, d6                    ; oq2 = oq0 - p1
-    vsubw.u8    q15, d9                    ; oq2 -= q1
-    vadd.i16    q15, q14
-    vqrshrn.u16 d27, q15, #3               ; r_oq2
-
-    ; Filter does not set op2 or oq2, so use p2 and q2.
-    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
-    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
-    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
-    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
-    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
-
-    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
-    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
-
-    tst         r7, #2
-    bxne        lr
-
-    ; wide_mbfilter flat2 && flat && mask branch
-    vmov.u8     d16, #7
-    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
-    vaddl.u8    q12, d2, d3
-    vaddl.u8    q13, d4, d5
-    vaddl.u8    q14, d1, d6
-    vmlal.u8    q15, d0, d16               ; op6 += p7 * 3
-    vadd.i16    q12, q13
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d2, d9
-    vadd.i16    q15, q12
-    vaddl.u8    q12, d0, d1
-    vaddw.u8    q15, d1
-    vaddl.u8    q13, d0, d2
-    vadd.i16    q14, q15, q14
-    vqrshrn.u16 d16, q15, #4               ; w_op6
-
-    vsub.i16    q15, q14, q12
-    vaddl.u8    q14, d3, d10
-    vqrshrn.u16 d24, q15, #4               ; w_op5
-
-    vsub.i16    q15, q13
-    vaddl.u8    q13, d0, d3
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d4, d11
-    vqrshrn.u16 d25, q15, #4               ; w_op4
-
-    vadd.i16    q15, q14
-    vaddl.u8    q14, d0, d4
-    vsub.i16    q15, q13
-    vsub.i16    q14, q15, q14
-    vqrshrn.u16 d26, q15, #4               ; w_op3
-
-    vaddw.u8    q15, q14, d5               ; op2 += p2
-    vaddl.u8    q14, d0, d5
-    vaddw.u8    q15, d12                   ; op2 += q4
-    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
-    vqrshrn.u16 d27, q15, #4               ; w_op2
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d0, d6
-    vaddw.u8    q15, d6                    ; op1 += p1
-    vaddw.u8    q15, d13                   ; op1 += q5
-    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
-    vqrshrn.u16 d18, q15, #4               ; w_op1
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d0, d7
-    vaddw.u8    q15, d7                    ; op0 += p0
-    vaddw.u8    q15, d14                   ; op0 += q6
-    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
-    vqrshrn.u16 d19, q15, #4               ; w_op0
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d1, d8
-    vaddw.u8    q15, d8                    ; oq0 += q0
-    vaddw.u8    q15, d15                   ; oq0 += q7
-    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
-    vqrshrn.u16 d20, q15, #4               ; w_oq0
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d2, d9
-    vaddw.u8    q15, d9                    ; oq1 += q1
-    vaddl.u8    q4, d10, d15
-    vaddw.u8    q15, d15                   ; oq1 += q7
-    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
-    vqrshrn.u16 d21, q15, #4               ; w_oq1
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d3, d10
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d11, d15
-    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
-    vqrshrn.u16 d22, q15, #4               ; w_oq2
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d4, d11
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d12, d15
-    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
-    vqrshrn.u16 d23, q15, #4               ; w_oq3
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d5, d12
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d13, d15
-    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
-    vqrshrn.u16 d1, q15, #4                ; w_oq4
-
-    vsub.i16    q15, q14
-    vaddl.u8    q14, d6, d13
-    vadd.i16    q15, q4
-    vaddl.u8    q4, d14, d15
-    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
-    vqrshrn.u16 d2, q15, #4                ; w_oq5
-
-    vsub.i16    q15, q14
-    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
-    vadd.i16    q15, q4
-    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
-    vqrshrn.u16 d3, q15, #4                ; w_oq6
-    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
-    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
-    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
-
-    bx          lr
-    ENDP        ; |aom_wide_mbfilter_neon|
-
-    END
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
index c90d6bfde..ee1a3c78f 100644
--- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,39 +11,690 @@
 
 #include <arm_neon.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+                                 uint8x8_t p0q0, const uint8_t blimit,
+                                 const uint8_t limit) {
+  // Calculate mask values for four samples; each input carries the p pixels in its low 32 bits and the q pixels in its high 32 bits.
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  uint8x8_t mask_8x8, temp_8x8;
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+  // Neighbour test per side: max(|3-2|, |2-1|, |1-0|) <= limit.
+  mask_8x8 = vabd_u8(p3q3, p2q2);
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+  // AND each half with the swapped half so a lane passes only if both its p and q sides pass.
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+  // Edge test: val[0] = (p0, p1) and val[1] = (q0, q1), so the difference holds |p0-q0| in the low half and |p1-q1| in the high half.
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);  // widen so the doubled value cannot overflow 8 bits
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);   // |p0-q0| * 2
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);  // |p1-q1| / 2
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));  // narrow and copy the 4-lane result into both halves
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;  // all-ones bytes in lanes that pass both tests, zero elsewhere
+}
+
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+                                       uint8x8_t p1q1, uint8x8_t p0q0) {
+  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
+  uint8x8_t flat_8x8, temp_8x8;
+  // Flatness test: each of the three outer inputs must differ from p0q0 by at most the threshold.
+  flat_8x8 = vabd_u8(p1q1, p0q0);
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
+  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+  // AND with the half-swapped copy so a lane is flat only when both halves (p and q) pass.
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+  return flat_8x8;  // all-ones bytes in flat lanes, zero elsewhere
+}
+
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+                                       uint8x8_t p0q0) {
+  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
+  uint8x8_t flat_8x8, temp_8x8;
+  // Flatness test: p1q1 and p2q2 must each differ from p0q0 by at most the threshold.
+  flat_8x8 = vabd_u8(p1q1, p0q0);
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+  // AND with the half-swapped copy so a lane is flat only when both halves (p and q) pass.
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+  return flat_8x8;  // all-ones bytes in flat lanes, zero elsewhere
+}
+
+static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+                                         uint8x8_t p0q0, const uint8_t blimit,
+                                         const uint8_t limit) {
+  // Calculate mask3 values for four samples; same tests as lpf_mask but using only three pixels per side.
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  uint8x8_t mask_8x8, temp_8x8;
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+  // Neighbour test per side: max(|2-1|, |1-0|) <= limit.
+  mask_8x8 = vabd_u8(p2q2, p1q1);
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+  // AND each half with the swapped half so a lane passes only if both its p and q sides pass.
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+  // Edge test: val[0] = (p0, p1) and val[1] = (q0, q1), so the difference holds |p0-q0| in the low half and |p1-q1| in the high half.
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);  // widen so the doubled value cannot overflow 8 bits
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);   // |p0-q0| * 2
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);  // |p1-q1| / 2
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));  // narrow and copy the 4-lane result into both halves
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;  // all-ones bytes in lanes that pass both tests, zero elsewhere
+}
+
+static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
+                        uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+                        uint8x8_t *p0q0, const uint8_t blimit,
+                        const uint8_t limit, const uint8_t thresh) {
+  uint16x8_t out;
+  uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
+      out_f14_pq5;
+  uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+  uint8x8_t out_f4_pq0, out_f4_pq1;
+  uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
+  uint8x8_t q0p0, q1p1, q2p2;
+  // Computes the filter 4, filter 8 and filter 14 results for every lane, then selects per lane and updates *p0q0..*p5q5 in place (*p6q6 is only read).
+  // Calculate filter masks
+  mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+  flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+  flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);  // same flatness test applied to the outer pixels 4..6
+  {
+    // filter 4
+    int32x2x2_t ps0_qs0, ps1_qs1;
+    int16x8_t filter_s16;
+    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+    uint8x8_t temp0_8x8, temp1_8x8;
+    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+    int8x8_t op0, oq0, op1, oq1;
+    int8x8_t pq_s0, pq_s1;
+    int8x8_t filter_s8, filter1_s8, filter2_s8;
+    int8x8_t hev_8x8;
+    const int8x8_t sign_mask = vdup_n_s8(0x80);
+    const int8x8_t val_4 = vdup_n_s8(4);
+    const int8x8_t val_3 = vdup_n_s8(3);
+    // Flip the top bit to map unsigned pixels onto the signed range used by the saturating arithmetic below.
+    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+    // Transposing a vector with itself yields val[0] = (p, p) and val[1] = (q, q): each side duplicated in both halves.
+    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+    // hev_mask: set when |p0-p1| or |q0-q1| exceeds thresh (the OR with the swapped halves combines both sides)
+    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+    // add outer taps if we have high edge variance
+    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    // inner taps: filter + 3 * (qs0 - ps0), accumulated in 16 bits then saturated back to 8
+    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+    filter_s16 = vmovl_s8(filter_s8);
+    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+    filter_s8 = vqmovn_s16(filter_s16);
+    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));  // zero the filter where the mask is clear
+
+    filter1_s8 = vqadd_s8(filter_s8, val_4);
+    filter2_s8 = vqadd_s8(filter_s8, val_3);
+    filter1_s8 = vshr_n_s8(filter1_s8, 3);
+    filter2_s8 = vshr_n_s8(filter2_s8, 3);
+    // Apply to the inner pixels and flip the top bit again to return to unsigned.
+    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+    // Outer pixels get the rounded half of filter1, and only where hev is not set.
+    hev_8x8 = vmvn_s8(hev_8x8);
+    filter_s8 = vrshr_n_s8(filter1_s8, 1);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+    // Re-pack as (p result, q result): upper half of op followed by lower half of oq.
+    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+  }
+  // reverse p and q
+  q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+  q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+  q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+  {
+    // filter 8
+    uint16x8_t out_pq0, out_pq1, out_pq2;
+    out = vaddl_u8(*p3q3, *p2q2);
+    out = vaddw_u8(out, *p1q1);
+    out = vaddw_u8(out, *p0q0);
+
+    out = vaddw_u8(out, q0p0);  // p half now holds p3 + p2 + p1 + p0 + q0 (q half is the mirror image)
+    out_pq1 = vaddw_u8(out, *p3q3);
+    out_pq2 = vaddw_u8(out_pq1, *p3q3);
+    out_pq2 = vaddw_u8(out_pq2, *p2q2);  // 3*p3 + 2*p2 + p1 + p0 + q0
+    out_pq1 = vaddw_u8(out_pq1, *p1q1);
+    out_pq1 = vaddw_u8(out_pq1, q1p1);  // 2*p3 + p2 + 2*p1 + p0 + q0 + q1
+
+    out_pq0 = vaddw_u8(out, *p0q0);
+    out_pq0 = vaddw_u8(out_pq0, q1p1);
+    out_pq0 = vaddw_u8(out_pq0, q2p2);  // p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
+    // Rounding narrow: (sum + 4) >> 3.
+    out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+    out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+    out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+  }
+  {
+    // filter 14
+    uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
+    uint16x8_t p6q6_2, p6q6_temp, qp_sum;
+    uint8x8_t qp_rev;
+    // Continue from the running sum left in 'out' by filter 8 (it must not be modified in between).
+    out = vaddw_u8(out, *p4q4);
+    out = vaddw_u8(out, *p5q5);
+    out = vaddw_u8(out, *p6q6);
+    // Build the six output sums incrementally so shared terms are added only once.
+    out_pq5 = vaddw_u8(out, *p4q4);
+    out_pq4 = vaddw_u8(out_pq5, *p3q3);
+    out_pq3 = vaddw_u8(out_pq4, *p2q2);
+
+    out_pq5 = vaddw_u8(out_pq5, *p5q5);
+    out_pq4 = vaddw_u8(out_pq4, *p5q5);
+
+    out_pq0 = vaddw_u8(out, *p1q1);
+    out_pq1 = vaddw_u8(out_pq0, *p2q2);
+    out_pq2 = vaddw_u8(out_pq1, *p3q3);
+
+    out_pq0 = vaddw_u8(out_pq0, *p0q0);
+    out_pq1 = vaddw_u8(out_pq1, *p0q0);
+    // Add growing multiples of the outermost pixel: 1x, 2x, 3x, 4x and 6x for outputs 1..5.
+    out_pq1 = vaddw_u8(out_pq1, *p6q6);
+    p6q6_2 = vaddl_u8(*p6q6, *p6q6);
+    out_pq2 = vaddq_u16(out_pq2, p6q6_2);
+    p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
+    out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
+    p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
+    out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
+    p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
+    out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
+    // Add the opposite-side pixels; qp_sum accumulates one more reversed vector for each output closer to the edge.
+    out_pq4 = vaddw_u8(out_pq4, q1p1);
+
+    qp_sum = vaddl_u8(q2p2, q1p1);
+    out_pq3 = vaddq_u16(out_pq3, qp_sum);
+
+    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
+    qp_sum = vaddw_u8(qp_sum, qp_rev);
+    out_pq2 = vaddq_u16(out_pq2, qp_sum);
+
+    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
+    qp_sum = vaddw_u8(qp_sum, qp_rev);
+    out_pq1 = vaddq_u16(out_pq1, qp_sum);
+
+    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
+    qp_sum = vaddw_u8(qp_sum, qp_rev);
+    out_pq0 = vaddq_u16(out_pq0, qp_sum);
+
+    out_pq0 = vaddw_u8(out_pq0, q0p0);
+    // Rounding narrow: (sum + 8) >> 4.
+    out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
+    out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
+    out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
+    out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
+    out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
+    out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
+  }
+  {
+    uint8x8_t filter4_cond, filter8_cond, filter14_cond;
+    filter8_cond = vand_u8(flat_8x8, mask_8x8);
+    filter4_cond = vmvn_u8(filter8_cond);
+    filter14_cond = vand_u8(filter8_cond, flat2_8x8);
+    // The selects run narrowest to widest, so a wider filter overrides a narrower one in lanes where both conditions hold.
+    // filter4 outputs (where mask is clear the filter value was zeroed above, so these equal the inputs)
+    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+    // filter8 outputs
+    *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+    *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+
+    // filter14 outputs
+    *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
+    *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
+    *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
+    *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
+    *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
+  }
+}
+
+static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+                       uint8x8_t *p0q0, const uint8_t blimit,
+                       const uint8_t limit, const uint8_t thresh) {
+  uint16x8_t out;
+  uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+  uint8x8_t out_f4_pq0, out_f4_pq1;
+  uint8x8_t mask_8x8, flat_8x8;
+  // Computes the filter 4 and filter 8 results for every lane, then selects per lane and updates *p0q0..*p2q2 in place (*p3q3 is only read).
+  // Calculate filter masks
+  mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+  flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+  {
+    // filter 4
+    int32x2x2_t ps0_qs0, ps1_qs1;
+    int16x8_t filter_s16;
+    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+    uint8x8_t temp0_8x8, temp1_8x8;
+    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+    int8x8_t op0, oq0, op1, oq1;
+    int8x8_t pq_s0, pq_s1;
+    int8x8_t filter_s8, filter1_s8, filter2_s8;
+    int8x8_t hev_8x8;
+    const int8x8_t sign_mask = vdup_n_s8(0x80);
+    const int8x8_t val_4 = vdup_n_s8(4);
+    const int8x8_t val_3 = vdup_n_s8(3);
+    // Flip the top bit to map unsigned pixels onto the signed range used by the saturating arithmetic below.
+    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+    // Transposing a vector with itself yields val[0] = (p, p) and val[1] = (q, q): each side duplicated in both halves.
+    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+    // hev_mask: set when |p0-p1| or |q0-q1| exceeds thresh (the OR with the swapped halves combines both sides)
+    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+    // add outer taps if we have high edge variance
+    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    // inner taps: filter + 3 * (qs0 - ps0), accumulated in 16 bits then saturated back to 8
+    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+    filter_s16 = vmovl_s8(filter_s8);
+    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+    filter_s8 = vqmovn_s16(filter_s16);
+    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));  // zero the filter where the mask is clear
 
-void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+    filter1_s8 = vqadd_s8(filter_s8, val_4);
+    filter2_s8 = vqadd_s8(filter_s8, val_3);
+    filter1_s8 = vshr_n_s8(filter1_s8, 3);
+    filter2_s8 = vshr_n_s8(filter2_s8, 3);
+    // Apply to the inner pixels and flip the top bit again to return to unsigned.
+    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+    // Outer pixels get the rounded half of filter1, and only where hev is not set.
+    hev_8x8 = vmvn_s8(hev_8x8);
+    filter_s8 = vrshr_n_s8(filter1_s8, 1);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+    // Re-pack as (p result, q result): upper half of op followed by lower half of oq.
+    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+  }
+  {
+    // filter 8
+    uint16x8_t out_pq0, out_pq1, out_pq2;
+    uint8x8_t q0p0, q1p1, q2p2;
+
+    out = vaddl_u8(*p3q3, *p2q2);
+    out = vaddw_u8(out, *p1q1);
+    out = vaddw_u8(out, *p0q0);
+
+    // reverse p and q
+    q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+    q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+    q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+
+    out = vaddw_u8(out, q0p0);  // p half now holds p3 + p2 + p1 + p0 + q0 (q half is the mirror image)
+    out_pq1 = vaddw_u8(out, *p3q3);
+    out_pq2 = vaddw_u8(out_pq1, *p3q3);
+    out_pq2 = vaddw_u8(out_pq2, *p2q2);  // 3*p3 + 2*p2 + p1 + p0 + q0
+    out_pq1 = vaddw_u8(out_pq1, *p1q1);
+    out_pq1 = vaddw_u8(out_pq1, q1p1);  // 2*p3 + p2 + 2*p1 + p0 + q0 + q1
+
+    out_pq0 = vaddw_u8(out, *p0q0);
+    out_pq0 = vaddw_u8(out_pq0, q1p1);
+    out_pq0 = vaddw_u8(out_pq0, q2p2);  // p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
+    // Rounding narrow: (sum + 4) >> 3.
+    out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+    out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+    out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+  }
+  {
+    uint8x8_t filter4_cond, filter8_cond;
+    filter8_cond = vand_u8(flat_8x8, mask_8x8);
+    filter4_cond = vmvn_u8(filter8_cond);  // the two conditions are complementary, so each lane takes exactly one result
+
+    // filter4 outputs (where mask is clear the filter value was zeroed above, so these equal the inputs)
+    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+    // filter8 outputs
+    *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+    *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+  }
+}
+
+static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
+                       const uint8_t blimit, const uint8_t limit,
+                       const uint8_t thresh) {
+  uint16x8_t out;
+  uint8x8_t out_f6_pq0, out_f6_pq1;
+  uint8x8_t out_f4_pq0, out_f4_pq1;
+  uint8x8_t mask_8x8, flat_8x8;
+  // Computes the filter 4 and filter 6 results for every lane, then selects per lane and updates *p0q0 and *p1q1 in place (*p2q2 is only read).
+  // Calculate filter masks
+  mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
+  flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
+  {
+    // filter 4
+    int32x2x2_t ps0_qs0, ps1_qs1;
+    int16x8_t filter_s16;
+    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+    uint8x8_t temp0_8x8, temp1_8x8;
+    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+    int8x8_t op0, oq0, op1, oq1;
+    int8x8_t pq_s0, pq_s1;
+    int8x8_t filter_s8, filter1_s8, filter2_s8;
+    int8x8_t hev_8x8;
+    const int8x8_t sign_mask = vdup_n_s8(0x80);
+    const int8x8_t val_4 = vdup_n_s8(4);
+    const int8x8_t val_3 = vdup_n_s8(3);
+    // Flip the top bit to map unsigned pixels onto the signed range used by the saturating arithmetic below.
+    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+    // Transposing a vector with itself yields val[0] = (p, p) and val[1] = (q, q): each side duplicated in both halves.
+    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+    // hev_mask: set when |p0-p1| or |q0-q1| exceeds thresh (the OR with the swapped halves combines both sides)
+    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+    // add outer taps if we have high edge variance
+    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+    filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+    // inner taps: filter + 3 * (qs0 - ps0), accumulated in 16 bits then saturated back to 8
+    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+    filter_s16 = vmovl_s8(filter_s8);
+    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+    filter_s8 = vqmovn_s16(filter_s16);
+    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));  // zero the filter where the mask is clear
+
+    filter1_s8 = vqadd_s8(filter_s8, val_4);
+    filter2_s8 = vqadd_s8(filter_s8, val_3);
+    filter1_s8 = vshr_n_s8(filter1_s8, 3);
+    filter2_s8 = vshr_n_s8(filter2_s8, 3);
+    // Apply to the inner pixels and flip the top bit again to return to unsigned.
+    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+    // Outer pixels get the rounded half of filter1, cleared where hev is set (vbic = AND with the complement).
+    filter_s8 = vrshr_n_s8(filter1_s8, 1);
+    filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+    // Re-pack as (p result, q result): upper half of op followed by lower half of oq.
+    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+  }
+  {
+    // filter 6
+    uint16x8_t out_pq0, out_pq1;
+    uint8x8_t pq_rev;
+
+    out = vaddl_u8(*p0q0, *p1q1);
+    out = vaddq_u16(out, out);
+    out = vaddw_u8(out, *p2q2);
+
+    pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+    out = vaddw_u8(out, pq_rev);  // p half now holds p2 + 2*p1 + 2*p0 + q0 (q half is the mirror image)
+
+    out_pq0 = vaddw_u8(out, pq_rev);
+    pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+    out_pq0 = vaddw_u8(out_pq0, pq_rev);  // p2 + 2*p1 + 2*p0 + 2*q0 + q1
+
+    out_pq1 = vaddw_u8(out, *p2q2);
+    out_pq1 = vaddw_u8(out_pq1, *p2q2);  // 3*p2 + 2*p1 + 2*p0 + q0
+    // Rounding narrow: (sum + 4) >> 3.
+    out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
+    out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
+  }
+  {
+    uint8x8_t filter4_cond, filter6_cond;
+    filter6_cond = vand_u8(flat_8x8, mask_8x8);
+    filter4_cond = vmvn_u8(filter6_cond);  // the two conditions are complementary, so each lane takes exactly one result
+
+    // filter4 outputs (where mask is clear the filter value was zeroed above, so these equal the inputs)
+    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+    // filter6 outputs
+    *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
+    *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
+  }
+}
+
+void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                              const uint8_t *limit, const uint8_t *thresh) {
+  uint8x16_t row0, row1, row2, row3;
+  uint8x8_t pxp3, p6p2, p5p1, p4p0;
+  uint8x8_t q0q4, q1q5, q2q6, q3qy;
+  uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
+  uint32x2_t pq_rev;
+  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+  // Filters a vertical edge over four rows in place: rows are read starting at src - 8, run through lpf_14_neon, and written back to the same addresses.
+  // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+  load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3);
+  // NOTE(review): transpose_u8_8x4 is defined outside this file; per the variable names each result presumably carries two pixel columns (e.g. p6 and p2) - confirm.
+  pxp3 = vget_low_u8(row0);
+  p6p2 = vget_low_u8(row1);
+  p5p1 = vget_low_u8(row2);
+  p4p0 = vget_low_u8(row3);
+  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+  q0q4 = vget_high_u8(row0);
+  q1q5 = vget_high_u8(row1);
+  q2q6 = vget_high_u8(row2);
+  q3qy = vget_high_u8(row3);
+  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+  // Pair matching p and q columns: the q vector is half-swapped first so vtrn_u32 puts (outer p, outer q) in val[0] and (inner p, inner q) in val[1].
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
+  pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
+  p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
+  p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
+  p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
+
+  p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+  p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+  p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+  p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+  p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+  p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+  p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+
+  lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+              *thresh);
+  // Undo the pairing: regroup the filtered vectors into the pre-filter layout; the unfiltered x/y columns are carried through in pxqx_p3q3.val[0].
+  pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
+  p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
+  p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
+  p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
+  // val[1] now holds (outer q, inner q); swap halves to restore the (inner q, outer q) order used at load time.
+  pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
+  p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
+  p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
+  p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
+
+  q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+  q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+  q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+  q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+  pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
+  p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+  p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+  p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+  // Rejoin the left and right halves of each row.
+  row0 = vcombine_u8(pxp3, q0q4);
+  row1 = vcombine_u8(p6p2, q1q5);
+  row2 = vcombine_u8(p5p1, q2q6);
+  row3 = vcombine_u8(p4p0, q3qy);
+
+  store_u8_8x16(src - 8, stride, row0, row1, row2, row3);
 }
 
-#if HAVE_NEON_ASM
-void aom_lpf_horizontal_8_dual_neon(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
+void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
+  uint32x2_t pq_rev;
+  uint8x8_t p3q0, p2q1, p1q2, p0q3;
+  uint8x8_t p0q0, p1q1, p2q2, p3q3;
+  // Filters a vertical edge over four rows in place: rows are read starting at src - 4, run through lpf_8_neon, and written back to the same addresses.
+  // row0: p3 p2 p1 p0 | q0 q1 q2 q3
+  // row1: p3 p2 p1 p0 | q0 q1 q2 q3
+  // row2: p3 p2 p1 p0 | q0 q1 q2 q3
+  // row3: p3 p2 p1 p0 | q0 q1 q2 q3
+  load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
+  // NOTE(review): transpose_u8_8x4 is defined outside this file; per the variable names each result presumably carries one p column and one q column - confirm.
+  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  // Regroup so each vector pairs the same-distance p and q columns: val[0] = (p3, q3) and val[1] = (q0, p0).
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
+  p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+  p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));  // val[1] is (q0, p0); swap halves to get (p0, q0)
+  p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+
+  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+  // Reverse the regrouping so the vectors are back in the post-transpose layout.
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+  p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
+
+  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+  p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+  p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+  p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+  p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+  store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
 }
 
-void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
+void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p0q0, p1q1, p2q2, p3q3;
+  // Filters a horizontal edge four pixels wide in place: 4 bytes from each of the four rows above src go in the low half, and the four rows from src down replace the high half.
+  p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
+  p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+  p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+                                           vreinterpret_u32_u8(p0q0), 1));
+  p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+                                           vreinterpret_u32_u8(p1q1), 1));
+  p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+                                           vreinterpret_u32_u8(p2q2), 1));
+  p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
+                                           vreinterpret_u32_u8(p3q3), 1));
+  // NOTE(review): the uint32_t casts above and below access uint8_t data as 32-bit lanes; nothing here checks the alignment of src - confirm callers or target tolerate this.
+  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+  // Lane 0 goes back to the rows above the edge, lane 1 to the rows below; lpf_8_neon never writes p3q3, so its two stores rewrite the loaded values.
+  vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
+  vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+  vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+  vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+  vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+  vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
 }
 
-void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit,
-                                   const uint8_t *thresh) {
-  aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  uint8x8_t p0q0, p1q1, p2q2;
+  // Filters a horizontal edge four pixels wide in place: 4 bytes from each of the three rows above src go in the low half, and the three rows from src down replace the high half.
+  p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+  p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+  p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+                                           vreinterpret_u32_u8(p0q0), 1));
+  p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+                                           vreinterpret_u32_u8(p1q1), 1));
+  p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+                                           vreinterpret_u32_u8(p2q2), 1));
+  // NOTE(review): the uint32_t casts above and below access uint8_t data as 32-bit lanes; nothing here checks the alignment of src - confirm callers or target tolerate this.
+  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+  // Lane 0 goes back to the rows above the edge, lane 1 to the rows below; lpf_6_neon never writes p2q2, so its two stores rewrite the loaded values.
+  vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+  vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+  vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+  vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
 }
-#endif  // HAVE_NEON_ASM
diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c
index a1eeaf4b7..606950ab2 100644
--- a/third_party/aom/aom_dsp/arm/sad4d_neon.c
+++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c
@@ -11,8 +11,9 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
index 2f452f55b..a39de91d6 100644
--- a/third_party/aom/aom_dsp/arm/sad_neon.c
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -11,7 +11,7 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
diff --git a/third_party/aom/aom_dsp/arm/save_reg_neon.asm b/third_party/aom/aom_dsp/arm/save_reg_neon.asm
deleted file mode 100644
index e04969823..000000000
--- a/third_party/aom/aom_dsp/arm/save_reg_neon.asm
+++ /dev/null
@@ -1,39 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-    EXPORT  |aom_push_neon|
-    EXPORT  |aom_pop_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-|aom_push_neon| PROC
-    vst1.i64            {d8, d9, d10, d11}, [r0]!
-    vst1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-|aom_pop_neon| PROC
-    vld1.i64            {d8, d9, d10, d11}, [r0]!
-    vld1.i64            {d12, d13, d14, d15}, [r0]!
-    bx              lr
-
-    ENDP
-
-    END
-
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
index 064b72d6f..44d821821 100644
--- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -10,8 +10,9 @@
  */
 
 #include <arm_neon.h>
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
 
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
index cb8a2daf8..28f5ace8e 100644
--- a/third_party/aom/aom_dsp/arm/subtract_neon.c
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -11,7 +11,8 @@
 
 #include <arm_neon.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
index dbab287e3..74385a601 100644
--- a/third_party/aom/aom_dsp/arm/variance_neon.c
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -11,8 +11,8 @@
 
 #include <arm_neon.h>
 
-#include "./aom_dsp_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
deleted file mode 100644
index f732224fd..000000000
--- a/third_party/aom/aom_dsp/avg.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <stdlib.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
-
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-// The order of the output coeff of the hadamard is not important. For
-// optimization purposes the final transpose may be skipped.
-void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
-                        int16_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;                        // coeff: 15 bit
-                                       // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-}
-
-// In place 16x16 2D Hadamard transform
-void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64] = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int aom_satd_c(const int16_t *coeff, int length) {
-  int i;
-  int satd = 0;
-  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
-
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return satd;
-}
-
-// Integer projection onto row vectors.
-// height: value range {16, 32, 64}.
-void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int ref_stride,
-                       int height) {
-  int idx;
-  const int norm_factor = height >> 1;
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
-    hbuf[idx] = 0;
-    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
-    // hbuf[idx]: 9 bit, dynamic range [0, 510].
-    hbuf[idx] /= norm_factor;
-    ++ref;
-  }
-}
-
-// width: value range {16, 32, 64}.
-int16_t aom_int_pro_col_c(const uint8_t *ref, int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx) sum += ref[idx];
-  return sum;
-}
-
-// ref: [0 - 510]
-// src: [0 - 510]
-// bwl: {2, 3, 4}
-int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
-  int i;
-  int width = 4 << bwl;
-  int sse = 0, mean = 0, var;
-
-  for (i = 0; i < width; ++i) {
-    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
-    mean += diff;                // mean: dynamic range 16 bits.
-    sse += diff * diff;          // sse:  dynamic range 26 bits.
-  }
-
-  // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
-  return var;
-}
-
-void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
-                      int ref_stride, int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(src[j] - ref[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
-                             int dp, int *min, int *max) {
-  int i, j;
-  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j] - d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
index 4f38afbc5..d05c3efdc 100644
--- a/third_party/aom/aom_dsp/binary_codes_reader.c
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -33,17 +33,6 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
   }
 }
 
-int16_t aom_read_primitive_symmetric_(aom_reader *r,
-                                      unsigned int mag_bits ACCT_STR_PARAM) {
-  if (aom_read_bit(r, ACCT_STR_NAME)) {
-    int s = aom_read_bit(r, ACCT_STR_NAME);
-    int16_t x = aom_read_literal(r, mag_bits, ACCT_STR_NAME) + 1;
-    return (s > 0 ? -x : x);
-  } else {
-    return 0;
-  }
-}
-
 uint16_t aom_read_primitive_quniform_(aom_reader *r,
                                       uint16_t n ACCT_STR_PARAM) {
   if (n <= 1) return 0;
@@ -62,76 +51,56 @@ static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
   return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
 }
 
-uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p,
-                                        uint16_t ref ACCT_STR_PARAM) {
-  if (n <= 1) return 0;
-  assert(p > 0 && p <= n);
-  assert(ref < n);
-  int lolimit = ref - p / 2;
-  const int hilimit = lolimit + p - 1;
-  if (lolimit < 0) {
-    lolimit = 0;
-  } else if (hilimit >= n) {
-    lolimit = n - p;
-  }
-  int v;
-  if (aom_read_bit(r, ACCT_STR_NAME)) {
-    v = aom_read_primitive_quniform(r, p, ACCT_STR_NAME) + lolimit;
-  } else {
-    v = aom_read_primitive_quniform(r, n - p, ACCT_STR_NAME);
-    if (v >= lolimit) v += p;
-  }
-  return v;
-}
-
 // Decode finite subexponential code that for a symbol v in [0, n-1] with
 // parameter k
 uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
                                        uint16_t k ACCT_STR_PARAM) {
   int i = 0;
   int mk = 0;
-  uint16_t v;
+
   while (1) {
     int b = (i ? k + i - 1 : k);
     int a = (1 << b);
+
     if (n <= mk + 3 * a) {
-      v = aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
-      break;
-    } else {
-      if (aom_read_bit(r, ACCT_STR_NAME)) {
-        i = i + 1;
-        mk += a;
-      } else {
-        v = aom_read_literal(r, b, ACCT_STR_NAME) + mk;
-        break;
-      }
+      return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
+    }
+
+    if (!aom_read_bit(r, ACCT_STR_NAME)) {
+      return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
     }
+
+    i = i + 1;
+    mk += a;
   }
-  return v;
+
+  assert(0);
+  return 0;
 }
 
 static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
                                                 uint16_t n, uint16_t k) {
   int i = 0;
   int mk = 0;
-  uint16_t v;
+
   while (1) {
     int b = (i ? k + i - 1 : k);
     int a = (1 << b);
+
     if (n <= mk + 3 * a) {
-      v = aom_rb_read_primitive_quniform(rb, n - mk) + mk;
-      break;
-    } else {
-      if (aom_rb_read_bit(rb)) {
-        i = i + 1;
-        mk += a;
-      } else {
-        v = aom_rb_read_literal(rb, b) + mk;
-        break;
-      }
+      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
     }
+
+    if (!aom_rb_read_bit(rb)) {
+      return aom_rb_read_literal(rb, b) + mk;
+    }
+
+    i = i + 1;
+    mk += a;
   }
-  return v;
+
+  assert(0);
+  return 0;
 }
 
 uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
@@ -146,20 +115,19 @@ static uint16_t aom_rb_read_primitive_refsubexpfin(
                                     aom_rb_read_primitive_subexpfin(rb, n, k));
 }
 
-// Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with
-// parameter k based on a reference ref also in [-(n-1), n-1].
-int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
-                                                uint16_t k,
-                                                int16_t ref ACCT_STR_PARAM) {
-  ref += n - 1;
-  const uint16_t scaled_n = (n << 1) - 1;
-  return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) -
-         n + 1;
-}
-
 int16_t aom_rb_read_signed_primitive_refsubexpfin(
     struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
   ref += n - 1;
   const uint16_t scaled_n = (n << 1) - 1;
   return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
 }
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+  int leading_zeros = 0;
+  // Give up after 32 zeros so malformed input cannot keep the loop running.
+  while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
+  if (leading_zeros == 32) return UINT32_MAX;  // Maximum 32 bits.
+  const uint32_t base = (1u << leading_zeros) - 1;
+  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+  return base + value;
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
index 8885142c9..5253c6154 100644
--- a/third_party/aom/aom_dsp/binary_codes_reader.h
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -18,40 +18,30 @@ extern "C" {
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitreader.h"
 #include "aom_dsp/bitreader_buffer.h"
 
-#define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \
-  aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
   aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_primitive_refbilevel(r, n, p, ref, ACCT_STR_NAME) \
-  aom_read_primitive_refbilevel_(r, n, p, ref ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
   aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
   aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_signed_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
-  aom_read_signed_primitive_refsubexpfin_(r, n, k,                          \
-                                          ref ACCT_STR_ARG(ACCT_STR_NAME))
 
-int16_t aom_read_primitive_symmetric_(aom_reader *r,
-                                      unsigned int mag_bits ACCT_STR_PARAM);
 uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
-uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p,
-                                        uint16_t ref ACCT_STR_PARAM);
 uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
                                        uint16_t k ACCT_STR_PARAM);
 uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
                                           uint16_t ref ACCT_STR_PARAM);
-int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n,
-                                                uint16_t k,
-                                                int16_t ref ACCT_STR_PARAM);
 
 int16_t aom_rb_read_signed_primitive_refsubexpfin(
     struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
index e092b6278..8f74f0942 100644
--- a/third_party/aom/aom_dsp/binary_codes_writer.c
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -89,61 +89,6 @@ int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
   return v < m ? l - 1 : l;
 }
 
-// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
-// The closest p values of v from ref are coded using a p-ary quasi-unoform
-// short code while the remaining n-p values are coded with a longer code.
-void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
-                                    uint16_t ref, uint16_t v) {
-  if (n <= 1) return;
-  assert(p > 0 && p <= n);
-  assert(ref < n);
-  int lolimit = ref - p / 2;
-  int hilimit = lolimit + p - 1;
-  if (lolimit < 0) {
-    lolimit = 0;
-    hilimit = p - 1;
-  } else if (hilimit >= n) {
-    hilimit = n - 1;
-    lolimit = n - p;
-  }
-  if (v >= lolimit && v <= hilimit) {
-    aom_write_bit(w, 1);
-    v = v - lolimit;
-    aom_write_primitive_quniform(w, p, v);
-  } else {
-    aom_write_bit(w, 0);
-    if (v > hilimit) v -= p;
-    aom_write_primitive_quniform(w, n - p, v);
-  }
-}
-
-int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
-                                   uint16_t v) {
-  if (n <= 1) return 0;
-  assert(p > 0 && p <= n);
-  assert(ref < n);
-  int lolimit = ref - p / 2;
-  int hilimit = lolimit + p - 1;
-  if (lolimit < 0) {
-    lolimit = 0;
-    hilimit = p - 1;
-  } else if (hilimit >= n) {
-    hilimit = n - 1;
-    lolimit = n - p;
-  }
-  int count = 0;
-  if (v >= lolimit && v <= hilimit) {
-    count++;
-    v = v - lolimit;
-    count += aom_count_primitive_quniform(p, v);
-  } else {
-    count++;
-    if (v > hilimit) v -= p;
-    count += aom_count_primitive_quniform(n - p, v);
-  }
-  return count;
-}
-
 // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
 void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
                                    uint16_t v) {
@@ -263,3 +208,15 @@ int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
   const uint16_t scaled_n = (n << 1) - 1;
   return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
 }
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+  // Emit k zero bits, then v + 1 in k + 1 bits, with k = floor(log2(v + 1)).
+  int64_t remaining = ++v;
+  int zero_bits = 0;
+
+  assert(remaining > 0);
+
+  for (remaining >>= 1; remaining != 0; remaining >>= 1) ++zero_bits;
+  aom_wb_write_literal(wb, 0, zero_bits);
+  aom_wb_write_unsigned_literal(wb, v, zero_bits + 1);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
index 18ad5078f..784c721a6 100644
--- a/third_party/aom/aom_dsp/binary_codes_writer.h
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -17,7 +17,8 @@ extern "C" {
 #endif
 
 #include <assert.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/bitwriter.h"
 #include "aom_dsp/bitwriter_buffer.h"
@@ -33,12 +34,6 @@ void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
 // Encodes a value v in [0, n-1] quasi-uniformly
 void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
 
-// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1]
-// The closest p values of v from ref are coded using a p-ary quasi-unoform
-// short code while the remaining n-p values are coded with a longer code.
-void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p,
-                                    uint16_t ref, uint16_t v);
-
 // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
 void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
                                    uint16_t v);
@@ -61,13 +56,12 @@ void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
 // Functions that counts bits for the above primitives
 int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
 int aom_count_primitive_quniform(uint16_t n, uint16_t v);
-int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref,
-                                   uint16_t v);
 int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
 int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
                                      uint16_t v);
 int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
                                             int16_t v);
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
index 00424fa76..328935be9 100644
--- a/third_party/aom/aom_dsp/bitreader.h
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -15,15 +15,11 @@
 #include <assert.h>
 #include <limits.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
-#if CONFIG_ANS
-#include "aom_dsp/ansreader.h"
-#else
 #include "aom_dsp/daalaboolreader.h"
-#endif
 #include "aom_dsp/prob.h"
 #include "av1/common/odintrin.h"
 
@@ -50,72 +46,37 @@
 #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
   aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
 
-#if CONFIG_LV_MAP
-#define aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_bin_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#if CONFIG_ANS
-typedef struct AnsDecoder aom_reader;
-#else
 typedef struct daala_reader aom_reader;
-#endif
 
 static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
-                                  size_t size, aom_decrypt_cb decrypt_cb,
-                                  void *decrypt_state) {
-  (void)decrypt_cb;
-  (void)decrypt_state;
-#if CONFIG_ANS
-  if (size > INT_MAX) return 1;
-  return ans_read_init(r, buffer, (int)size);
-#else
+                                  size_t size) {
   return aom_daala_reader_init(r, buffer, (int)size);
-#endif
+}
+
+static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) {
+  return aom_daala_reader_find_begin(r);
 }
 
 static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
-#if CONFIG_ANS
-  (void)r;
-  assert(0 && "Use the raw buffer size with ANS");
-  return NULL;
-#else
   return aom_daala_reader_find_end(r);
-#endif
 }
 
 static INLINE int aom_reader_has_error(aom_reader *r) {
-#if CONFIG_ANS
-  return ans_reader_has_error(r);
-#else
   return aom_daala_reader_has_error(r);
-#endif
 }
 
 // Returns the position in the bit reader in bits.
 static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
-#if CONFIG_ANS
-  (void)r;
-  assert(0 && "aom_reader_tell() is unimplemented for ANS");
-  return 0;
-#else
   return aom_daala_reader_tell(r);
-#endif
 }
 
 // Returns the position in the bit reader in 1/8th bits.
 static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
-#if CONFIG_ANS
-  (void)r;
-  assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
-  return 0;
-#else
   return aom_daala_reader_tell_frac(r);
-#endif
 }
 
 #if CONFIG_ACCOUNTING
@@ -139,11 +100,7 @@ static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
 
 static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
   int ret;
-#if CONFIG_ANS
-  ret = rabs_read(r, prob);
-#else
   ret = aom_daala_read(r, prob);
-#endif
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
   aom_update_symb_counts(r, 1);
@@ -153,15 +110,7 @@ static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
 
 static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
   int ret;
-#if CONFIG_ANS
-  ret = rabs_read_bit(r);  // Non trivial optimization at half probability
-#elif CONFIG_RAWBITS
-  // Note this uses raw bits and is not the same as aom_daala_read(r, 128);
-  // Calls to this function are omitted from raw symbol accounting.
-  ret = aom_daala_read_bit(r);
-#else
   ret = aom_read(r, 128, NULL);  // aom_prob_half
-#endif
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
@@ -181,12 +130,7 @@ static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
 static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
                                 int nsymbs ACCT_STR_PARAM) {
   int ret;
-#if CONFIG_ANS
-  (void)nsymbs;
-  ret = rans_read(r, cdf);
-#else
   ret = daala_read_symbol(r, cdf, nsymbs);
-#endif
 
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
@@ -199,46 +143,7 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
                                    int nsymbs ACCT_STR_PARAM) {
   int ret;
   ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
-  update_cdf(cdf, ret, nsymbs);
-  return ret;
-}
-
-#if CONFIG_LV_MAP
-static INLINE int aom_read_bin_(aom_reader *r, aom_cdf_prob *cdf,
-                                int nsymbs ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
-  update_cdf(cdf, ret, nsymbs);
-  return ret;
-}
-#endif
-
-static INLINE int aom_read_tree_as_cdf(aom_reader *r,
-                                       const aom_tree_index *tree,
-                                       const aom_prob *probs) {
-  aom_tree_index i = 0;
-  do {
-    aom_cdf_prob cdf[16];
-    aom_tree_index index[16];
-    int path[16];
-    int dist[16];
-    int nsymbs;
-    int symb;
-    nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist);
-    symb = aom_read_cdf(r, cdf, nsymbs, NULL);
-    OD_ASSERT(symb >= 0 && symb < nsymbs);
-    i = index[symb];
-  } while (i > 0);
-  return -i;
-}
-
-static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
-                                 const aom_prob *probs ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_read_tree_as_cdf(r, tree, probs);
-#if CONFIG_ACCOUNTING
-  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-#endif
+  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
   return ret;
 }
 
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
index e51b1cc3a..68fc381f2 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -8,8 +8,9 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include "./aom_config.h"
-#include "./bitreader_buffer.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
 
 size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
   return (rb->bit_offset + 7) >> 3;
@@ -35,9 +36,13 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
   return value;
 }
 
-int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
-  const int value = aom_rb_read_literal(rb, bits);
-  return aom_rb_read_bit(rb) ? -value : value;
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+                                      int bits) {
+  // The first bit read becomes the most significant bit of the result.
+  uint32_t result = 0;
+  for (int remaining = bits; remaining > 0; --remaining) {
+    result = (result << 1) | (uint32_t)aom_rb_read_bit(rb);
+  }
+  return result;
 }
 int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
index 22187357e..2dafe11ad 100644
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -37,7 +37,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
 
 int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
 
-int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
 
 int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
 
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
index 7d3b34306..de1b1d048 100644
--- a/third_party/aom/aom_dsp/bitwriter.h
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -13,13 +13,10 @@
 #define AOM_DSP_BITWRITER_H_
 
 #include <assert.h>
-#include "./aom_config.h"
 
-#if CONFIG_ANS
-#include "aom_dsp/buf_ans.h"
-#else
+#include "config/aom_config.h"
+
 #include "aom_dsp/daalaboolwriter.h"
-#endif
 #include "aom_dsp/prob.h"
 
 #if CONFIG_RD_DEBUG
@@ -31,23 +28,16 @@
 extern "C" {
 #endif
 
-#if CONFIG_ANS
-typedef struct BufAnsCoder aom_writer;
-#else
 typedef struct daala_writer aom_writer;
-#endif
 
 typedef struct TOKEN_STATS {
   int cost;
-#if CONFIG_VAR_TX
 #if CONFIG_RD_DEBUG
   int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
 #endif
-#endif
 } TOKEN_STATS;
 
 static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
-#if CONFIG_VAR_TX
 #if CONFIG_RD_DEBUG
   int r, c;
   for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
@@ -56,65 +46,23 @@ static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
     }
   }
 #endif
-#endif
   token_stats->cost = 0;
 }
 
 static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
-#if CONFIG_ANS
-  aom_buf_ans_alloc(bc, /* error context*/ NULL);
-  buf_ans_write_init(bc, buffer);
-#else
   aom_daala_start_encode(bc, buffer);
-#endif
 }
 
-static INLINE void aom_stop_encode(aom_writer *bc) {
-#if CONFIG_ANS
-  aom_buf_ans_flush(bc);
-  bc->pos = buf_ans_write_end(bc);
-#else
-  aom_daala_stop_encode(bc);
-#endif
+static INLINE int aom_stop_encode(aom_writer *bc) {
+  return aom_daala_stop_encode(bc);
 }
 
 static INLINE void aom_write(aom_writer *br, int bit, int probability) {
-#if CONFIG_ANS
-  buf_rabs_write(br, bit, probability);
-#else
   aom_daala_write(br, bit, probability);
-#endif
-}
-
-static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
-                                    TOKEN_STATS *token_stats) {
-  aom_write(br, bit, probability);
-#if CONFIG_RD_DEBUG
-  token_stats->cost += av1_cost_bit(probability, bit);
-#else
-  (void)token_stats;
-#endif
 }
 
 static INLINE void aom_write_bit(aom_writer *w, int bit) {
-#if CONFIG_ANS
-  buf_rabs_write_bit(w, bit);
-#elif CONFIG_RAWBITS
-  // Note this uses raw bits and is not the same as aom_daala_write(r, 128);
-  aom_daala_write_bit(w, bit);
-#else
   aom_write(w, bit, 128);  // aom_prob_half
-#endif
-}
-
-static INLINE void aom_write_bit_record(aom_writer *w, int bit,
-                                        TOKEN_STATS *token_stats) {
-  aom_write_bit(w, bit);
-#if CONFIG_RD_DEBUG
-  token_stats->cost += av1_cost_bit(128, bit);  // aom_prob_half
-#else
-  (void)token_stats;
-#endif
 }
 
 static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
@@ -125,83 +73,13 @@ static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
 
 static INLINE void aom_write_cdf(aom_writer *w, int symb,
                                  const aom_cdf_prob *cdf, int nsymbs) {
-#if CONFIG_ANS
-  (void)nsymbs;
-  assert(cdf);
-  const aom_cdf_prob cum_prob = symb > 0 ? cdf[symb - 1] : 0;
-  const aom_cdf_prob prob = cdf[symb] - cum_prob;
-  buf_rans_write(w, cum_prob, prob);
-#else
   daala_write_symbol(w, symb, cdf, nsymbs);
-#endif
 }
 
 static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
                                     int nsymbs) {
   aom_write_cdf(w, symb, cdf, nsymbs);
-  update_cdf(cdf, symb, nsymbs);
-}
-
-#if CONFIG_LV_MAP
-static INLINE void aom_write_bin(aom_writer *w, int symb, aom_cdf_prob *cdf,
-                                 int nsymbs) {
-  aom_write_cdf(w, symb, cdf, nsymbs);
-  update_cdf(cdf, symb, nsymbs);
-}
-#endif
-
-static INLINE void aom_write_tree_as_cdf(aom_writer *w,
-                                         const aom_tree_index *tree,
-                                         const aom_prob *probs, int bits,
-                                         int len, aom_tree_index i) {
-  aom_tree_index root;
-  root = i;
-  do {
-    aom_cdf_prob cdf[16];
-    aom_tree_index index[16];
-    int path[16];
-    int dist[16];
-    int nsymbs;
-    int symb;
-    int j;
-    /* Compute the CDF of the binary tree using the given probabilities. */
-    nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist);
-    /* Find the symbol to code. */
-    symb = -1;
-    for (j = 0; j < nsymbs; j++) {
-      /* If this symbol codes a leaf node,  */
-      if (index[j] <= 0) {
-        if (len == dist[j] && path[j] == bits) {
-          symb = j;
-          break;
-        }
-      } else {
-        if (len > dist[j] && path[j] == bits >> (len - dist[j])) {
-          symb = j;
-          break;
-        }
-      }
-    }
-    OD_ASSERT(symb != -1);
-    aom_write_cdf(w, symb, cdf, nsymbs);
-    bits &= (1 << (len - dist[symb])) - 1;
-    len -= dist[symb];
-  } while (len);
-}
-
-static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
-                                  const aom_prob *probs, int bits, int len,
-                                  aom_tree_index i) {
-  aom_write_tree_as_cdf(w, tree, probs, bits, len, i);
-}
-
-static INLINE void aom_write_tree_record(aom_writer *w,
-                                         const aom_tree_index *tree,
-                                         const aom_prob *probs, int bits,
-                                         int len, aom_tree_index i,
-                                         TOKEN_STATS *token_stats) {
-  (void)token_stats;
-  aom_write_tree_as_cdf(w, tree, probs, bits, len, i);
+  if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
 }
 
 #ifdef __cplusplus
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
index 1b3dd2913..21314eb2a 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -12,8 +12,13 @@
 #include <limits.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./bitwriter_buffer.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+  return (wb->bit_offset % CHAR_BIT == 0);
+}
 
 uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
@@ -48,6 +53,12 @@ void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
   for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
 }
 
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
 void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
                               int bits) {
   int bit;
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
index 1f23dc857..f7f75a097 100644
--- a/third_party/aom/aom_dsp/bitwriter_buffer.h
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -23,6 +23,8 @@ struct aom_write_bit_buffer {
   uint32_t bit_offset;
 };
 
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
+
 uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
 
 void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
@@ -31,6 +33,9 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
 
 void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
 
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits);
+
 void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
                               int bits);
 
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
index e5297ff83..434bb83a1 100644
--- a/third_party/aom/aom_dsp/blend.h
+++ b/third_party/aom/aom_dsp/blend.h
@@ -39,4 +39,7 @@
 // Blending by averaging.
 #define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
 
+#define DIFF_FACTOR_LOG2 4
+#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
+
 #endif  // AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
index 99b4b8a59..0554b43d1 100644
--- a/third_party/aom/aom_dsp/blend_a64_hmask.c
+++ b/third_party/aom/aom_dsp/blend_a64_hmask.c
@@ -16,12 +16,12 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w) {
+                           const uint8_t *mask, int w, int h) {
   int i, j;
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
@@ -40,11 +40,10 @@ void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
-                                  const uint8_t *mask, int h, int w, int bd) {
+                                  const uint8_t *mask, int w, int h, int bd) {
   int i, j;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
   const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -68,4 +67,3 @@ void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
index c35fa19f8..992cc5c0c 100644
--- a/third_party/aom/aom_dsp/blend_a64_mask.c
+++ b/third_party/aom/aom_dsp/blend_a64_mask.c
@@ -16,70 +16,209 @@
 #include "aom_dsp/blend.h"
 #include "aom_dsp/aom_dsp_common.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
-#if CONFIG_CONVOLVE_ROUND
 // Blending with alpha mask. Mask values come from the range [0, 64],
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
 // be the same as dst, or dst can be different from both sources.
 
-void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
-                              const int32_t *src0, uint32_t src0_stride,
-                              const int32_t *src1, uint32_t src1_stride,
-                              const uint8_t *mask, uint32_t mask_stride, int h,
-                              int w, int subh, int subw) {
+// NOTE(david.barker): The inputs of the aom_*_blend_a64_d16_mask_c() functions
+// are in a higher intermediate precision, and the blended value is only
+// rounded down to pixel precision at the end.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// for the blend itself, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d16 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
+void aom_lowbd_blend_a64_d16_mask_c(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
   int i, j;
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
 
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
 
-  assert(h >= 1);
-  assert(w >= 1);
+  assert(h >= 4);
+  assert(w >= 4);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
   if (subw == 0 && subh == 0) {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((m * (int32_t)src0[i * src0_stride + j] +
+                (AOM_BLEND_A64_MAX_ALPHA - m) *
+                    (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
       }
     }
   } else if (subw == 1 && subh == 1) {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = ROUND_POWER_OF_TWO(
             mask[(2 * i) * mask_stride + (2 * j)] +
                 mask[(2 * i + 1) * mask_stride + (2 * j)] +
                 mask[(2 * i) * mask_stride + (2 * j + 1)] +
                 mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
             2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((m * (int32_t)src0[i * src0_stride + j] +
+                (AOM_BLEND_A64_MAX_ALPHA - m) *
+                    (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
       }
     }
   } else if (subw == 1 && subh == 0) {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                     mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((m * (int32_t)src0[i * src0_stride + j] +
+                (AOM_BLEND_A64_MAX_ALPHA - m) *
+                    (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
       }
     }
   } else {
     for (i = 0; i < h; ++i) {
       for (j = 0; j < w; ++j) {
+        int32_t res;
         const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                     mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
+        res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
+                         (AOM_BLEND_A64_MAX_ALPHA - m) *
+                             (int32_t)src1[i * src1_stride + j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        dst[i * dst_stride + j] =
+            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+      }
+    }
+  }
+}
+
+void aom_highbd_blend_a64_d16_mask_c(
+    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params, const int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  // excerpt from clip_pixel_highbd()
+  // set saturation_value to (1 << bd) - 1
+  unsigned int saturation_value;
+  switch (bd) {
+    case 8:
+    default: saturation_value = 255; break;
+    case 10: saturation_value = 1023; break;
+    case 12: saturation_value = 4095; break;
+  }
+
+  if (subw == 0 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = mask[j];
+        res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+               AOM_BLEND_A64_ROUND_BITS);
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
+      }
+      mask += mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = ROUND_POWER_OF_TWO(
+            mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
+                mask[mask_stride + 2 * j + 1],
+            2);
+        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+              AOM_BLEND_A64_ROUND_BITS;
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
+      }
+      mask += 2 * mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
+        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+              AOM_BLEND_A64_ROUND_BITS;
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
+      }
+      mask += mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    }
+  } else {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        int32_t res;
+        const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
+        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+              AOM_BLEND_A64_ROUND_BITS;
+        res -= round_offset;
+        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+        dst[j] = AOMMIN(v, saturation_value);
       }
+      mask += 2 * mask_stride;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
     }
   }
 }
-#endif  // CONFIG_CONVOLVE_ROUND
 
 // Blending with alpha mask. Mask values come from the range [0, 64],
 // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
@@ -88,8 +227,8 @@ void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride,
 void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
-                          const uint8_t *mask, uint32_t mask_stride, int h,
-                          int w, int subh, int subw) {
+                          const uint8_t *mask, uint32_t mask_stride, int w,
+                          int h, int subw, int subh) {
   int i, j;
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
@@ -142,12 +281,11 @@ void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, uint32_t mask_stride,
-                                 int h, int w, int subh, int subw, int bd) {
+                                 int w, int h, int subw, int subh, int bd) {
   int i, j;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
   const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -205,4 +343,3 @@ void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
index 1a5e30e31..4f222e17f 100644
--- a/third_party/aom/aom_dsp/blend_a64_vmask.c
+++ b/third_party/aom/aom_dsp/blend_a64_vmask.c
@@ -16,12 +16,12 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w) {
+                           const uint8_t *mask, int w, int h) {
   int i, j;
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
@@ -41,11 +41,10 @@ void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                   const uint8_t *src0_8, uint32_t src0_stride,
                                   const uint8_t *src1_8, uint32_t src1_stride,
-                                  const uint8_t *mask, int h, int w, int bd) {
+                                  const uint8_t *mask, int w, int h, int bd) {
   int i, j;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
   const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -70,4 +69,3 @@ void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
index f84ff3aed..cf7df1dbf 100644
--- a/third_party/aom/aom_dsp/buf_ans.h
+++ b/third_party/aom/aom_dsp/buf_ans.h
@@ -16,7 +16,8 @@
 // backwards due to ANS's stack like behavior.
 
 #include <assert.h>
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/ans.h"
 #include "aom_dsp/answriter.h"
@@ -47,6 +48,7 @@ struct BufAnsCoder {
   int window_size;
 #endif
   int pos;  // Dummy variable to store the output buffer after closing
+  uint8_t allow_update_cdf;
 };
 
 // Allocate a buffered ANS coder to store size symbols.
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
index c6e3ac82d..4e224904e 100644
--- a/third_party/aom/aom_dsp/daalaboolreader.c
+++ b/third_party/aom/aom_dsp/daalaboolreader.c
@@ -24,6 +24,10 @@ int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
   return 0;
 }
 
+const uint8_t *aom_daala_reader_find_begin(daala_reader *r) {
+  return r->buffer;
+}
+
 const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
   return r->buffer_end;
 }
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
index 55ff8d3d5..60c197a49 100644
--- a/third_party/aom/aom_dsp/daalaboolreader.h
+++ b/third_party/aom/aom_dsp/daalaboolreader.h
@@ -34,11 +34,13 @@ struct daala_reader {
 #if CONFIG_ACCOUNTING
   Accounting *accounting;
 #endif
+  uint8_t allow_update_cdf;
 };
 
 typedef struct daala_reader daala_reader;
 
 int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
+const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
 const uint8_t *aom_daala_reader_find_end(daala_reader *r);
 uint32_t aom_daala_reader_tell(const daala_reader *r);
 uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
@@ -96,12 +98,6 @@ static INLINE int aom_daala_read(daala_reader *r, int prob) {
   return bit;
 }
 
-#if CONFIG_RAWBITS
-static INLINE int aom_daala_read_bit(daala_reader *r) {
-  return od_ec_dec_bits(&r->ec, 1, "aom_bits");
-}
-#endif
-
 static INLINE int aom_daala_reader_has_error(daala_reader *r) {
   return r->ec.error;
 }
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c
index 59af2a243..b24ffbf3f 100644
--- a/third_party/aom/aom_dsp/daalaboolwriter.c
+++ b/third_party/aom/aom_dsp/daalaboolwriter.c
@@ -18,11 +18,14 @@ void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
   od_ec_enc_init(&br->ec, 62025);
 }
 
-void aom_daala_stop_encode(daala_writer *br) {
+int aom_daala_stop_encode(daala_writer *br) {
+  int nb_bits;
   uint32_t daala_bytes;
   unsigned char *daala_data;
   daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
+  nb_bits = od_ec_enc_tell(&br->ec);
   memcpy(br->buffer, daala_data, daala_bytes);
   br->pos = daala_bytes;
   od_ec_enc_clear(&br->ec);
+  return nb_bits;
 }
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
index 6ec0f0b54..f9c596c73 100644
--- a/third_party/aom/aom_dsp/daalaboolwriter.h
+++ b/third_party/aom/aom_dsp/daalaboolwriter.h
@@ -28,12 +28,13 @@ struct daala_writer {
   unsigned int pos;
   uint8_t *buffer;
   od_ec_enc ec;
+  uint8_t allow_update_cdf;
 };
 
 typedef struct daala_writer daala_writer;
 
 void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
-void aom_daala_stop_encode(daala_writer *w);
+int aom_daala_stop_encode(daala_writer *w);
 
 static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
   int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
@@ -53,12 +54,6 @@ static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
   od_ec_encode_bool_q15(&w->ec, bit, p);
 }
 
-#if CONFIG_RAWBITS
-static INLINE void aom_daala_write_bit(daala_writer *w, int bit) {
-  od_ec_enc_bits(&w->ec, bit, 1);
-}
-#endif
-
 static INLINE void daala_write_symbol(daala_writer *w, int symb,
                                       const aom_cdf_prob *cdf, int nsymbs) {
 #if CONFIG_BITSTREAM_DEBUG
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
index ad76b7e3e..aad96c6fc 100644
--- a/third_party/aom/aom_dsp/entcode.c
+++ b/third_party/aom/aom_dsp/entcode.c
@@ -9,10 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
-
 #include "aom_dsp/entcode.h"
 
 /*Given the current total integer number of bits used and the current value of
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
index 981a951e6..5c15526e9 100644
--- a/third_party/aom/aom_dsp/entcode.h
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -9,11 +9,16 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#if !defined(_entcode_H)
-#define _entcode_H (1)
+#ifndef AOM_DSP_ENTCODE_H_
+#define AOM_DSP_ENTCODE_H_
+
 #include <limits.h>
 #include <stddef.h>
 #include "av1/common/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
 
 /*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
    on a larger type, you can speed up the decoder by using it here.*/
@@ -21,22 +26,15 @@ typedef uint32_t od_ec_window;
 
 #define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
 
-/*The number of bits to use for the range-coded part of unsigned integers.*/
-#define OD_EC_UINT_BITS (4)
-
 /*The resolution of fractional-precision bit usage measurements, i.e.,
    3 => 1/8th bits.*/
 #define OD_BITRES (3)
 
-/*The value stored in an iCDF is 32768 minus the actual Q15 cumulative
-   probability (an "inverse" CDF).
-  This function converts from one representation to the other (and is its own
-   inverse).*/
-#define OD_ICDF(x) (32768U - (x))
+#define OD_ICDF AOM_ICDF
 
 /*See entcode.c for further documentation.*/
 
 OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
                                                uint32_t rng);
 
-#endif
+#endif  // AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
index 71dad0df6..b8e9078c3 100644
--- a/third_party/aom/aom_dsp/entdec.c
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -9,11 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
-
+#include <assert.h>
 #include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
 
 /*A range decoder.
   This is an entropy decoder based upon \cite{Mar79}, which is itself a
@@ -75,6 +73,8 @@
   Even relatively modest values like 100 would work fine.*/
 #define OD_EC_LOTS_OF_BITS (0x4000)
 
+/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill
+   call.*/
 static void od_ec_dec_refill(od_ec_dec *dec) {
   int s;
   od_ec_window dif;
@@ -87,7 +87,7 @@ static void od_ec_dec_refill(od_ec_dec *dec) {
   end = dec->end;
   s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
   for (; s >= 0 && bptr < end; s -= 8, bptr++) {
-    OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8);
+    assert(s <= OD_EC_WINDOW_SIZE - 8);
     dif ^= (od_ec_window)bptr[0] << s;
     cnt += 8;
   }
@@ -111,7 +111,7 @@ static void od_ec_dec_refill(od_ec_dec *dec) {
 static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
                                int ret) {
   int d;
-  OD_ASSERT(rng <= 65535U);
+  assert(rng <= 65535U);
   d = 16 - OD_ILOG_NZ(rng);
   dec->cnt -= d;
   /*This is equivalent to shifting in 1's instead of 0's.*/
@@ -127,9 +127,6 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
 void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
                     uint32_t storage) {
   dec->buf = buf;
-  dec->eptr = buf + storage;
-  dec->end_window = 0;
-  dec->nend_bits = 0;
   dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
   dec->end = buf + storage;
   dec->bptr = buf;
@@ -150,13 +147,14 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
   unsigned r_new;
   unsigned v;
   int ret;
-  OD_ASSERT(0 < f);
-  OD_ASSERT(f < 32768U);
+  assert(0 < f);
+  assert(f < 32768U);
   dif = dec->dif;
   r = dec->rng;
-  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  OD_ASSERT(32768U <= r);
-  v = (r >> 8) * (uint32_t)f >> 7;
+  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  assert(32768U <= r);
+  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+  v += EC_MIN_PROB;
   vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
   ret = 1;
   r_new = v;
@@ -170,8 +168,8 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
 
 /*Decodes a symbol given an inverse cumulative distribution function (CDF)
    table in Q15.
-  icdf: 32768 minus the CDF, such that symbol s falls in the range
-         [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+  icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
+         [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
         The values must be monotonically non-increasing, and icdf[nsyms - 1]
          must be 0.
   nsyms: The number of symbols in the alphabet.
@@ -187,62 +185,28 @@ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
   (void)nsyms;
   dif = dec->dif;
   r = dec->rng;
-  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U));
-  OD_ASSERT(32768U <= r);
+  const int N = nsyms - 1;
+
+  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+  assert(32768U <= r);
+  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
   c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
   v = r;
   ret = -1;
   do {
     u = v;
-    v = (r >> 8) * (uint32_t)icdf[++ret] >> 7;
+    v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
+         (7 - EC_PROB_SHIFT - CDF_SHIFT));
+    v += EC_MIN_PROB * (N - ret);
   } while (c < v);
-  OD_ASSERT(v < u);
-  OD_ASSERT(u <= r);
+  assert(v < u);
+  assert(u <= r);
   r = u - v;
   dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
   return od_ec_dec_normalize(dec, dif, r, ret);
 }
 
-#if CONFIG_RAWBITS
-/*Extracts a sequence of raw bits from the stream.
-  The bits must have been encoded with od_ec_enc_bits().
-  ftb: The number of bits to extract.
-       This must be between 0 and 25, inclusive.
-  Return: The decoded bits.*/
-uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
-  od_ec_window window;
-  int available;
-  uint32_t ret;
-  OD_ASSERT(ftb <= 25);
-  window = dec->end_window;
-  available = dec->nend_bits;
-  if ((unsigned)available < ftb) {
-    const unsigned char *buf;
-    const unsigned char *eptr;
-    buf = dec->buf;
-    eptr = dec->eptr;
-    OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
-    do {
-      if (eptr <= buf) {
-        dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
-        available = OD_EC_LOTS_OF_BITS;
-        break;
-      }
-      window |= (od_ec_window) * --eptr << available;
-      available += 8;
-    } while (available <= OD_EC_WINDOW_SIZE - 8);
-    dec->eptr = eptr;
-  }
-  ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
-  window >>= ftb;
-  available -= ftb;
-  dec->end_window = window;
-  dec->nend_bits = available;
-  return ret;
-}
-#endif
-
 /*Returns the number of bits "used" by the decoded symbols so far.
   This same number can be computed in either the encoder or the decoder, and is
    suitable for making coding decisions.
@@ -250,8 +214,7 @@ uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
           This will always be slightly larger than the exact value (e.g., all
            rounding error is in the positive direction).*/
 int od_ec_dec_tell(const od_ec_dec *dec) {
-  return (int)(((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 -
-               dec->cnt - dec->nend_bits + dec->tell_offs);
+  return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
 }
 
 /*Returns the number of bits "used" by the decoded symbols so far.
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
index 35ac7fe0d..e35c3f99f 100644
--- a/third_party/aom/aom_dsp/entdec.h
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -32,16 +32,10 @@ typedef struct od_ec_dec od_ec_dec;
 struct od_ec_dec {
   /*The start of the current input buffer.*/
   const unsigned char *buf;
-  /*The read pointer for the raw bits.*/
-  const unsigned char *eptr;
-  /*Bits that will be read from/written at the end.*/
-  od_ec_window end_window;
-  /*Number of valid bits in end_window.*/
-  int nend_bits;
   /*An offset used to keep track of tell after reaching the end of the stream.
     This is constant throughout most of the decoding process, but becomes
      important once we hit the end of the buffer and stop incrementing pointers
-     (and instead pretend cnt/nend_bits have lots of bits).*/
+     (and instead pretend cnt has lots of bits).*/
   int32_t tell_offs;
   /*The end of the current input buffer.*/
   const unsigned char *end;
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
index b8c4dc047..6866de9b9 100644
--- a/third_party/aom/aom_dsp/entenc.c
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -9,13 +9,19 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifdef HAVE_CONFIG_H
-#include "./config.h"
-#endif
-
 #include <stdlib.h>
 #include <string.h>
+#include <math.h>
+#include <assert.h>
 #include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if OD_MEASURE_EC_OVERHEAD
+#if !defined(M_LOG2E)
+#define M_LOG2E (1.4426950408889634073599246810019)
+#endif
+#define OD_LOG2(x) (M_LOG2E * log(x))
+#endif  // OD_MEASURE_EC_OVERHEAD
 
 /*A range encoder.
   See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
@@ -53,7 +59,7 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
   int c;
   int s;
   c = enc->cnt;
-  OD_ASSERT(rng <= 65535U);
+  assert(rng <= 65535U);
   d = 16 - OD_ILOG_NZ(rng);
   s = c + d;
   /*TODO: Right now we flush every time we have at least one byte available.
@@ -83,13 +89,13 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
     c += 16;
     m = (1 << c) - 1;
     if (s >= 8) {
-      OD_ASSERT(offs < storage);
+      assert(offs < storage);
       buf[offs++] = (uint16_t)(low >> c);
       low &= m;
       c -= 8;
       m >>= 8;
     }
-    OD_ASSERT(offs < storage);
+    assert(offs < storage);
     buf[offs++] = (uint16_t)(low >> c);
     s = c + d - 24;
     low &= m;
@@ -120,9 +126,6 @@ void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
 
 /*Reinitializes the encoder.*/
 void od_ec_enc_reset(od_ec_enc *enc) {
-  enc->end_offs = 0;
-  enc->end_window = 0;
-  enc->nend_bits = 0;
   enc->offs = 0;
   enc->low = 0;
   enc->rng = 0x8000;
@@ -143,31 +146,42 @@ void od_ec_enc_clear(od_ec_enc *enc) {
 }
 
 /*Encodes a symbol given its frequency in Q15.
-  fl: 32768 minus the cumulative frequency of all symbols that come before the
+  fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
+       before the
        one to be encoded.
-  fh: 32768 minus the cumulative frequency of all symbols up to and including
+  fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
+       including
        the one to be encoded.*/
-static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
+static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
+                             int nsyms) {  // s: symbol index, nsyms: count.
   od_ec_window l;
   unsigned r;
   unsigned u;
   unsigned v;
   l = enc->low;
   r = enc->rng;
-  OD_ASSERT(32768U <= r);
-  OD_ASSERT(fh < fl);
-  OD_ASSERT(fl <= 32768U);
-  if (fl < 32768U) {
-    u = (r >> 8) * (uint32_t)fl >> 7;
-    v = (r >> 8) * (uint32_t)fh >> 7;
+  assert(32768U <= r);
+  assert(fh <= fl);
+  assert(fl <= 32768U);
+  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);  // Right-shift count used below.
+  const int N = nsyms - 1;  // Index of the last symbol.
+  if (fl < CDF_PROB_TOP) {
+    u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >>
+         (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+        EC_MIN_PROB * (N - (s - 1));  // EC_MIN_PROB for each symbol >= s.
+    v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
+         (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+        EC_MIN_PROB * (N - (s + 0));  // EC_MIN_PROB for each symbol > s.
     l += r - u;
     r = u - v;
   } else {
-    r -= (r >> 8) * (uint32_t)fh >> 7;
+    r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
+          (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+         EC_MIN_PROB * (N - (s + 0));
   }
   od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.);
+  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP);
   enc->nb_symbols++;
 #endif
 }
@@ -179,18 +193,18 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
   od_ec_window l;
   unsigned r;
   unsigned v;
-  OD_ASSERT(0 < f);
-  OD_ASSERT(f < 32768U);
+  assert(0 < f);
+  assert(f < 32768U);
   l = enc->low;
   r = enc->rng;
-  OD_ASSERT(32768U <= r);
-  v = (r >> 8) * (uint32_t)f >> 7;
+  assert(32768U <= r);
+  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+  v += EC_MIN_PROB;
   if (val) l += r - v;
   r = val ? v : r - v;
   od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -=
-      OD_LOG2((double)(val ? 32768 - OD_ICDF(f) : OD_ICDF(f)) / 32768.);
+  enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
   enc->nb_symbols++;
 #endif
 }
@@ -206,67 +220,12 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
 void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
                           int nsyms) {
   (void)nsyms;
-  OD_ASSERT(s >= 0);
-  OD_ASSERT(s < nsyms);
-  OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U));
-  od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s]);
+  assert(s >= 0);
+  assert(s < nsyms);
+  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+  od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms);
 }
 
-#if CONFIG_RAWBITS
-/*Encodes a sequence of raw bits in the stream.
-  fl: The bits to encode.
-  ftb: The number of bits to encode.
-       This must be between 0 and 25, inclusive.*/
-void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
-  od_ec_window end_window;
-  int nend_bits;
-  OD_ASSERT(ftb <= 25);
-  OD_ASSERT(fl < (uint32_t)1 << ftb);
-#if OD_MEASURE_EC_OVERHEAD
-  enc->entropy += ftb;
-#endif
-  end_window = enc->end_window;
-  nend_bits = enc->nend_bits;
-  if (nend_bits + ftb > OD_EC_WINDOW_SIZE) {
-    unsigned char *buf;
-    uint32_t storage;
-    uint32_t end_offs;
-    buf = enc->buf;
-    storage = enc->storage;
-    end_offs = enc->end_offs;
-    if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) {
-      unsigned char *new_buf;
-      uint32_t new_storage;
-      new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3);
-      new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage);
-      if (new_buf == NULL) {
-        enc->error = -1;
-        enc->end_offs = 0;
-        return;
-      }
-      OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs,
-              end_offs);
-      storage = new_storage;
-      free(buf);
-      enc->buf = buf = new_buf;
-      enc->storage = storage;
-    }
-    do {
-      OD_ASSERT(end_offs < storage);
-      buf[storage - ++end_offs] = (unsigned char)end_window;
-      end_window >>= 8;
-      nend_bits -= 8;
-    } while (nend_bits >= 8);
-    enc->end_offs = end_offs;
-  }
-  OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE);
-  end_window |= (od_ec_window)fl << nend_bits;
-  nend_bits += ftb;
-  enc->end_window = end_window;
-  enc->nend_bits = nend_bits;
-}
-#endif
-
 /*Overwrites a few bits at the very start of an existing stream, after they
    have already been encoded.
   This makes it possible to have a few flags up front, where it is easy for
@@ -284,9 +243,9 @@ void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
 void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
   int shift;
   unsigned mask;
-  OD_ASSERT(nbits >= 0);
-  OD_ASSERT(nbits <= 8);
-  OD_ASSERT(val < 1U << nbits);
+  assert(nbits >= 0);
+  assert(nbits <= 8);
+  assert(val < 1U << nbits);
   shift = 8 - nbits;
   mask = ((1U << nbits) - 1) << shift;
   if (enc->offs > 0) {
@@ -318,12 +277,9 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
   uint32_t storage;
   uint16_t *buf;
   uint32_t offs;
-  uint32_t end_offs;
-  int nend_bits;
   od_ec_window m;
   od_ec_window e;
   od_ec_window l;
-  unsigned r;
   int c;
   int s;
   if (enc->error) return NULL;
@@ -341,16 +297,10 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
   /*We output the minimum number of bits that ensures that the symbols encoded
      thus far will be decoded correctly regardless of the bits that follow.*/
   l = enc->low;
-  r = enc->rng;
   c = enc->cnt;
-  s = 9;
-  m = 0x7FFF;
-  e = (l + m) & ~m;
-  while ((e | m) >= l + r) {
-    s++;
-    m >>= 1;
-    e = (l + m) & ~m;
-  }
+  s = 10;
+  m = 0x3FFF;
+  e = ((l + m) & ~m) | (m + 1);
   s += c;
   offs = enc->offs;
   buf = enc->precarry_buf;
@@ -369,7 +319,7 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
     }
     n = (1 << (c + 16)) - 1;
     do {
-      OD_ASSERT(offs < storage);
+      assert(offs < storage);
       buf[offs++] = (uint16_t)(e >> (c + 16));
       e &= n;
       s -= 8;
@@ -377,49 +327,31 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
       n >>= 8;
     } while (s > 0);
   }
-  /*Make sure there's enough room for the entropy-coded bits and the raw
-     bits.*/
+  /*Make sure there's enough room for the entropy-coded bits.*/
   out = enc->buf;
   storage = enc->storage;
-  end_offs = enc->end_offs;
-  e = enc->end_window;
-  nend_bits = enc->nend_bits;
-  s = -s;
-  c = OD_MAXI((nend_bits - s + 7) >> 3, 0);
-  if (offs + end_offs + c > storage) {
-    storage = offs + end_offs + c;
+  c = OD_MAXI((s + 7) >> 3, 0);
+  if (offs + c > storage) {
+    storage = offs + c;
     out = (unsigned char *)realloc(out, sizeof(*out) * storage);
     if (out == NULL) {
       enc->error = -1;
       return NULL;
     }
-    OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs);
     enc->buf = out;
     enc->storage = storage;
   }
-  /*If we have buffered raw bits, flush them as well.*/
-  while (nend_bits > s) {
-    OD_ASSERT(end_offs < storage);
-    out[storage - ++end_offs] = (unsigned char)e;
-    e >>= 8;
-    nend_bits -= 8;
-  }
-  *nbytes = offs + end_offs;
+  *nbytes = offs;
   /*Perform carry propagation.*/
-  OD_ASSERT(offs + end_offs <= storage);
-  out = out + storage - (offs + end_offs);
+  assert(offs <= storage);
+  out = out + storage - offs;
   c = 0;
-  end_offs = offs;
   while (offs > 0) {
     offs--;
     c = buf[offs] + c;
     out[offs] = (unsigned char)c;
     c >>= 8;
   }
-  /*Add any remaining raw bits to the last byte.
-    There is guaranteed to be enough room, because nend_bits <= s.*/
-  OD_ASSERT(nend_bits <= 0 || end_offs > 0);
-  if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e;
   /*Note: Unless there's an allocation error, if you keep encoding into the
      current buffer and call this function again later, everything will work
      just fine (you won't get a new packet out, but you will get a single
@@ -441,7 +373,7 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
 int od_ec_enc_tell(const od_ec_enc *enc) {
   /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
      bit, which we reserve for terminating the stream.*/
-  return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10;
+  return (enc->cnt + 10) + enc->offs * 8;
 }
 
 /*Returns the number of bits "used" by the encoded symbols so far.
@@ -476,8 +408,8 @@ void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
   uint32_t storage;
   uint16_t *precarry_buf;
   uint32_t precarry_storage;
-  OD_ASSERT(dst->storage >= src->storage);
-  OD_ASSERT(dst->precarry_storage >= src->precarry_storage);
+  assert(dst->storage >= src->storage);
+  assert(dst->precarry_storage >= src->precarry_storage);
   buf = dst->buf;
   storage = dst->storage;
   precarry_buf = dst->precarry_buf;
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
index 314b36318..1988f6818 100644
--- a/third_party/aom/aom_dsp/entenc.h
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -30,12 +30,6 @@ struct od_ec_enc {
   unsigned char *buf;
   /*The size of the buffer.*/
   uint32_t storage;
-  /*The offset at which the last byte containing raw bits was written.*/
-  uint32_t end_offs;
-  /*Bits that will be read from/written at the end.*/
-  od_ec_window end_window;
-  /*Number of valid bits in end_window.*/
-  int nend_bits;
   /*A buffer for output bytes with their associated carry flags.*/
   uint16_t *precarry_buf;
   /*The size of the pre-carry buffer.*/
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
index 09d945afc..3804519b3 100644
--- a/third_party/aom/aom_dsp/fastssim.c
+++ b/third_party/aom/aom_dsp/fastssim.c
@@ -15,8 +15,10 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/ssim.h"
 #include "aom_ports/system_state.h"
 
@@ -25,12 +27,11 @@ typedef struct fs_ctx fs_ctx;
 
 #define SSIM_C1 (255 * 255 * 0.01 * 0.01)
 #define SSIM_C2 (255 * 255 * 0.03 * 0.03)
-#if CONFIG_HIGHBITDEPTH
 #define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
 #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
 #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
 #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
-#endif
+
 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
 #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
 
@@ -139,8 +140,8 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) {
 
 static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
                                  int _s1ystride, const uint8_t *_src2,
-                                 int _s2ystride, int _w, int _h, uint32_t bd,
-                                 uint32_t shift) {
+                                 int _s2ystride, int _w, int _h, uint32_t shift,
+                                 int buf_is_hbd) {
   uint32_t *dst1;
   uint32_t *dst2;
   int w;
@@ -161,7 +162,7 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
       int i1;
       i0 = 2 * i;
       i1 = FS_MINI(i0 + 1, _w);
-      if (bd == 8 && shift == 0) {
+      if (!buf_is_hbd) {
         dst1[j * w + i] =
             _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
             _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
@@ -198,13 +199,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
   int i;
   int j;
   double ssim_c1 = SSIM_C1;
-#if CONFIG_HIGHBITDEPTH
+
   if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
   if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
-#else
-  assert(bit_depth == 8);
-  (void)bit_depth;
-#endif
+
   w = _ctx->level[_l].w;
   h = _ctx->level[_l].h;
   col_sums_x = _ctx->col_buf;
@@ -323,13 +321,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
   int i;
   int j;
   double ssim_c2 = SSIM_C2;
-#if CONFIG_HIGHBITDEPTH
   if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
   if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
-#else
-  assert(bit_depth == 8);
-  (void)bit_depth;
-#endif
 
   w = _ctx->level[_l].w;
   h = _ctx->level[_l].h;
@@ -448,14 +441,14 @@ static double convert_ssim_db(double _ssim, double _weight) {
 
 static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
                         int _dystride, int _w, int _h, uint32_t _bd,
-                        uint32_t _shift) {
+                        uint32_t _shift, int buf_is_hbd) {
   fs_ctx ctx;
   double ret;
   int l;
   ret = 1;
   fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
-  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
-                       _shift);
+  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
+                       buf_is_hbd);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
     fs_calc_structure(&ctx, l, _bd);
     ret *= fs_average(&ctx, l);
@@ -476,18 +469,19 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
   uint32_t bd_shift = 0;
   aom_clear_system_state();
   assert(bd >= in_bd);
-
+  assert(source->flags == dest->flags);
+  int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
   bd_shift = bd - in_bd;
 
   *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
                       dest->y_stride, source->y_crop_width,
-                      source->y_crop_height, in_bd, bd_shift);
+                      source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
   *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift);
+                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
   *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift);
+                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
   ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
   return convert_ssim_db(ssimv, 1.0);
 }
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
new file mode 100644
index 000000000..0ba71cfb3
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+// Writes the transpose of the n x n row-major matrix A into B.
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+  for (int row = 0; row < n; ++row) {
+    float *const dst = B + row * n;
+    for (int col = 0; col < n; ++col) dst[col] = A[col * n + row];
+  }
+}
+
+// Assembles the 2d fft output from the packed result of the two 1d passes,
+// putting the real and imaginary components next to each other (real first).
+// The 1d transform is real to complex and packs the complex results to take
+// advantage of conjugate symmetry (the n/2 + 1 real components, followed by
+// the n/2 - 1 imaginary components). After the transform on the rows, the
+// first n/2 + 1 columns are real and the remaining ones are imaginary. After
+// the transform on the columns, the region [0, n/2]x[0, n/2] contains the
+// real part of the fft of the real columns; the real part of the 2d fft also
+// includes the imaginary part of the transformed imaginary columns.
+// Rows 0..n/2 are written for columns 0..n/2, plus row n - y when y_extra.
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+                                    int n) {
+  for (int y = 0; y <= n / 2; ++y) {
+    const int y2 = y + n / 2;
+    const int y_extra = y2 > n / 2 && y2 < n;  // y > 0 and y2 is in range.
+
+    for (int x = 0; x <= n / 2; ++x) {
+      const int x2 = x + n / 2;
+      const int x_extra = x2 > n / 2 && x2 < n;  // x > 0 and x2 is in range.
+      output[2 * (y * n + x)] =
+          col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+      output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+                                    (x_extra ? col_fft[y * n + x2] : 0);
+      if (y_extra) {  // Also fill row n - y, with the two y2 terms negated.
+        output[2 * ((n - y) * n + x)] =
+            col_fft[y * n + x] +
+            (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+        output[2 * ((n - y) * n + x) + 1] =
+            -(y_extra ? col_fft[y2 * n + x] : 0) +
+            (x_extra ? col_fft[y * n + x2] : 0);
+      }
+    }
+  }
+}
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+                    aom_fft_unpack_func_t unpack, int vec_size) {
+  // First pass: 1d transform of the input, advancing vec_size columns a call.
+  for (int col = 0; col < n; col += vec_size) {
+    tform(&input[col], &output[col], n);
+  }
+  transpose(output, temp, n);
+  // Second pass: the same 1d transform over the transposed first-pass result.
+  for (int col = 0; col < n; col += vec_size) {
+    tform(&temp[col], &output[col], n);
+  }
+  transpose(output, temp, n);
+  unpack(temp, output, n);
+}
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float add_float(float a, float b) { return a + b; }
+static INLINE float sub_float(float a, float b) { return a - b; }
+static INLINE float mul_float(float a, float b) { return a * b; }
+
+GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+          sub_float);
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+          sub_float, mul_float);
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+                 unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+                     aom_fft_1d_func_t ifft_multi,
+                     aom_fft_transpose_func_t transpose, int vec_size) {
+  // `input` holds interleaved (real, imaginary) pairs. Columns 0 and n/2 have
+  // conjugate symmetry, so we can directly do the ifft and get real outputs.
+  for (int y = 0; y <= n / 2; ++y) {
+    output[y * n] = input[2 * y * n];
+    output[y * n + 1] = input[2 * (y * n + n / 2)];
+  }
+  for (int y = n / 2 + 1; y < n; ++y) {
+    output[y * n] = input[2 * (y - n / 2) * n + 1];
+    output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+  }
+
+  for (int i = 0; i < 2; i += vec_size) {
+    ifft_multi(output + i, temp + i, n);
+  }
+
+  // For the other columns, since we don't have a full ifft for complex inputs,
+  // we have to split them into their real and imaginary counterparts.
+  // Pack the real components first, then the imaginary components.
+  for (int y = 0; y < n; ++y) {
+    for (int x = 1; x < n / 2; ++x) {
+      output[y * n + (x + 1)] = input[2 * (y * n + x)];
+    }
+    for (int x = 1; x < n / 2; ++x) {
+      output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+    }
+  }
+  for (int y = 2; y < vec_size; y++) {  // No-op unless vec_size > 2.
+    fft_single(output + y, temp + y, n);
+  }
+  // This is the part that can be sped up with SIMD.
+  for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+    fft_multi(output + y, temp + y, n);
+  }
+
+  // Put the 0th and (n/2)th results in the correct place.
+  for (int x = 0; x < n; ++x) {
+    output[x] = temp[x * n];
+    output[(n / 2) * n + x] = temp[x * n + 1];
+  }
+  // This rearranges and transposes the remaining results.
+  for (int y = 1; y < n / 2; ++y) {
+    // Fill in the real columns.
+    for (int x = 0; x <= n / 2; ++x) {
+      output[x + y * n] =
+          temp[(y + 1) + x * n] +
+          ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+                          temp[(y + n / 2) + ((n - x) + n / 2) * n];
+    }
+    // Fill in the imaginary columns.
+    for (int x = 0; x <= n / 2; ++x) {
+      output[x + (y + n / 2) * n] =
+          temp[(y + n / 2) + x * n] -
+          ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+                                    temp[(y + n / 2) + (n - x) * n];
+    }
+  }
+  for (int y = 0; y < n; y += vec_size) {
+    ifft_multi(output + y, temp + y, n);
+  }
+  transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+           sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+                  aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+                  aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+                  aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
new file mode 100644
index 000000000..2f3cd5fdc
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_FFT_COMMON_H_
+#define AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use stride = 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in]  input   Input buffer. See above for size restrictions.
+ * \param[out] output  Output buffer. See above for size restrictions.
+ * \param[in]  stride  The spacing in number of elements between rows
+ *                     (or elements)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+                                  int stride);
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations. Each one matches aom_fft_1d_func_t.
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in]  input  Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in]  n      Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+                                         int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in]  input  Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in]  n      Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in]  input     Input buffer to run the transform on (size n x n)
+ * \param[out] temp      Working buffer for computing the transform (size n x n)
+ * \param[out] output    Output buffer (size 2 x n x n)
+ * \param[in]  n         Extent of one dimension of the square input.
+ * \param[in]  tform     Forward transform function
+ * \param[in]  transpose Transpose function (for n x n matrix)
+ * \param[in]  unpack    Unpack function used to massage outputs to correct form
+ * \param[in]  vec_size  Vector size (vec_size units are transformed at a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+                    aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions.
+ *
+ * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp       Working buffer for computations (size 2 x n x n)
+ * \param[out] output     Output buffer (size n x n)
+ * \param[in]  n          Extent of one dimension of the square output.
+ * \param[in]  fft_single Forward transform function (non vectorized)
+ * \param[in]  fft_multi  Forward transform function (vectorized)
+ * \param[in]  ifft_multi Inverse transform function (vectorized)
+ * \param[in]  transpose  Transpose function (for n x n matrix)
+ * \param[in]  vec_size   Vector size (vec_size units are transformed at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+                     aom_fft_1d_func_t ifft_multi,
+                     aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
+
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
+  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+    const T_VEC i0 = load(input + 0 * stride);                      \
+    const T_VEC i1 = load(input + 1 * stride);                      \
+    store(output + 0 * stride, i0 + i1); /* r_0; native '+' */      \
+    store(output + 1 * stride, i0 - i1); /* r_1; native '-' */      \
+  }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
+    const T_VEC kWeight0 = constant(0.0f); /* zero; sub(0, x) negates */  \
+    const T_VEC i0 = load(input + 0 * stride);                            \
+    const T_VEC i1 = load(input + 1 * stride);                            \
+    const T_VEC i2 = load(input + 2 * stride);                            \
+    const T_VEC i3 = load(input + 3 * stride);                            \
+    const T_VEC w0 = add(i0, i2);                                         \
+    const T_VEC w1 = sub(i0, i2);                                         \
+    const T_VEC w2 = add(i1, i3);                                         \
+    const T_VEC w3 = sub(i1, i3);                                         \
+    store(output + 0 * stride, add(w0, w2)); /* r_0 */                    \
+    store(output + 1 * stride, w1); /* r_1 */                             \
+    store(output + 2 * stride, sub(w0, w2)); /* r_2 */                    \
+    store(output + 3 * stride, sub(kWeight0, w3)); /* i_1 = -w3 */        \
+  }
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {            \
+    const T_VEC kWeight0 = constant(0.0f); /* zero; sub(0, x) negates x */     \
+    const T_VEC kWeight2 = constant(0.707107f); /* ~ cos(pi/4) */              \
+    const T_VEC i0 = load(input + 0 * stride);                                 \
+    const T_VEC i1 = load(input + 1 * stride);                                 \
+    const T_VEC i2 = load(input + 2 * stride);                                 \
+    const T_VEC i3 = load(input + 3 * stride);                                 \
+    const T_VEC i4 = load(input + 4 * stride);                                 \
+    const T_VEC i5 = load(input + 5 * stride);                                 \
+    const T_VEC i6 = load(input + 6 * stride);                                 \
+    const T_VEC i7 = load(input + 7 * stride);                                 \
+    const T_VEC w0 = add(i0, i4);                                              \
+    const T_VEC w1 = sub(i0, i4);                                              \
+    const T_VEC w2 = add(i2, i6);                                              \
+    const T_VEC w3 = sub(i2, i6);                                              \
+    const T_VEC w4 = add(w0, w2);                                              \
+    const T_VEC w5 = sub(w0, w2);                                              \
+    const T_VEC w7 = add(i1, i5);                                              \
+    const T_VEC w8 = sub(i1, i5);                                              \
+    const T_VEC w9 = add(i3, i7);                                              \
+    const T_VEC w10 = sub(i3, i7);                                             \
+    const T_VEC w11 = add(w7, w9);                                             \
+    const T_VEC w12 = sub(w7, w9);                                             \
+    store(output + 0 * stride, add(w4, w11)); /* out 0..4: r_0..r_4 */         \
+    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));          \
+    store(output + 2 * stride, w5);                                            \
+    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));          \
+    store(output + 4 * stride, sub(w4, w11));                                  \
+    store(output + 5 * stride, /* out 5..7: i_1..i_3 */                        \
+          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));                \
+    store(output + 6 * stride, sub(kWeight0, w12));                            \
+    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));          \
+  }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+                   mul)                                                    \
+  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
+    const T_VEC kWeight0 = constant(0.0f); /* zero; sub(0, x) negates x */ \
+    const T_VEC kWeight2 = constant(0.707107f); /* ~ cos(pi/4) */          \
+    const T_VEC kWeight3 = constant(0.92388f); /* ~ cos(pi/8) */           \
+    const T_VEC kWeight4 = constant(0.382683f); /* ~ sin(pi/8) */          \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC i4 = load(input + 4 * stride);                             \
+    const T_VEC i5 = load(input + 5 * stride);                             \
+    const T_VEC i6 = load(input + 6 * stride);                             \
+    const T_VEC i7 = load(input + 7 * stride);                             \
+    const T_VEC i8 = load(input + 8 * stride);                             \
+    const T_VEC i9 = load(input + 9 * stride);                             \
+    const T_VEC i10 = load(input + 10 * stride);                           \
+    const T_VEC i11 = load(input + 11 * stride);                           \
+    const T_VEC i12 = load(input + 12 * stride);                           \
+    const T_VEC i13 = load(input + 13 * stride);                           \
+    const T_VEC i14 = load(input + 14 * stride);                           \
+    const T_VEC i15 = load(input + 15 * stride);                           \
+    const T_VEC w0 = add(i0, i8);                                          \
+    const T_VEC w1 = sub(i0, i8);                                          \
+    const T_VEC w2 = add(i4, i12);                                         \
+    const T_VEC w3 = sub(i4, i12);                                         \
+    const T_VEC w4 = add(w0, w2);                                          \
+    const T_VEC w5 = sub(w0, w2);                                          \
+    const T_VEC w7 = add(i2, i10);                                         \
+    const T_VEC w8 = sub(i2, i10);                                         \
+    const T_VEC w9 = add(i6, i14);                                         \
+    const T_VEC w10 = sub(i6, i14);                                        \
+    const T_VEC w11 = add(w7, w9);                                         \
+    const T_VEC w12 = sub(w7, w9);                                         \
+    const T_VEC w14 = add(w4, w11);                                        \
+    const T_VEC w15 = sub(w4, w11);                                        \
+    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
+                           sub(sub(kWeight0, w3),                          \
+                               mul(kWeight2, add(w10, w8))) };             \
+    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
+                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
+    const T_VEC w19 = add(i1, i9);                                         \
+    const T_VEC w20 = sub(i1, i9);                                         \
+    const T_VEC w21 = add(i5, i13);                                        \
+    const T_VEC w22 = sub(i5, i13);                                        \
+    const T_VEC w23 = add(w19, w21);                                       \
+    const T_VEC w24 = sub(w19, w21);                                       \
+    const T_VEC w26 = add(i3, i11);                                        \
+    const T_VEC w27 = sub(i3, i11);                                        \
+    const T_VEC w28 = add(i7, i15);                                        \
+    const T_VEC w29 = sub(i7, i15);                                        \
+    const T_VEC w30 = add(w26, w28);                                       \
+    const T_VEC w31 = sub(w26, w28);                                       \
+    const T_VEC w33 = add(w23, w30);                                       \
+    const T_VEC w34 = sub(w23, w30);                                       \
+    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
+                           sub(sub(kWeight0, w22),                         \
+                               mul(kWeight2, add(w29, w27))) };            \
+    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
+                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
+    store(output + 0 * stride, add(w14, w33)); /* out 0..8: r_0..r_8 */    \
+    store(output + 1 * stride,                                             \
+          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
+    store(output + 3 * stride,                                             \
+          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+    store(output + 4 * stride, w15);                                       \
+    store(output + 5 * stride,                                             \
+          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
+                          mul(kWeight3, w37[1]))));                        \
+    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
+    store(output + 7 * stride,                                             \
+          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
+                          mul(kWeight4, w35[1]))));                        \
+    store(output + 8 * stride, sub(w14, w33));                             \
+    store(output + 9 * stride, /* out 9..15: i_1..i_7 */                   \
+          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+    store(output + 10 * stride,                                            \
+          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
+    store(output + 11 * stride,                                            \
+          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+    store(output + 12 * stride, sub(kWeight0, w34));                       \
+    store(output + 13 * stride,                                            \
+          sub(sub(kWeight0, w18[1]),                                       \
+              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
+    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
+    store(output + 15 * stride,                                            \
+          sub(sub(kWeight0, w16[1]),                                       \
+              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
+  }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
+                   mul)                                                      \
+  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
+    const T_VEC kWeight0 = constant(0.0f);                                   \
+    const T_VEC kWeight2 = constant(0.707107f);                              \
+    const T_VEC kWeight3 = constant(0.92388f);                               \
+    const T_VEC kWeight4 = constant(0.382683f);                              \
+    const T_VEC kWeight5 = constant(0.980785f);                              \
+    const T_VEC kWeight6 = constant(0.19509f);                               \
+    const T_VEC kWeight7 = constant(0.83147f);                               \
+    const T_VEC kWeight8 = constant(0.55557f);                               \
+    const T_VEC i0 = load(input + 0 * stride);                               \
+    const T_VEC i1 = load(input + 1 * stride);                               \
+    const T_VEC i2 = load(input + 2 * stride);                               \
+    const T_VEC i3 = load(input + 3 * stride);                               \
+    const T_VEC i4 = load(input + 4 * stride);                               \
+    const T_VEC i5 = load(input + 5 * stride);                               \
+    const T_VEC i6 = load(input + 6 * stride);                               \
+    const T_VEC i7 = load(input + 7 * stride);                               \
+    const T_VEC i8 = load(input + 8 * stride);                               \
+    const T_VEC i9 = load(input + 9 * stride);                               \
+    const T_VEC i10 = load(input + 10 * stride);                             \
+    const T_VEC i11 = load(input + 11 * stride);                             \
+    const T_VEC i12 = load(input + 12 * stride);                             \
+    const T_VEC i13 = load(input + 13 * stride);                             \
+    const T_VEC i14 = load(input + 14 * stride);                             \
+    const T_VEC i15 = load(input + 15 * stride);                             \
+    const T_VEC i16 = load(input + 16 * stride);                             \
+    const T_VEC i17 = load(input + 17 * stride);                             \
+    const T_VEC i18 = load(input + 18 * stride);                             \
+    const T_VEC i19 = load(input + 19 * stride);                             \
+    const T_VEC i20 = load(input + 20 * stride);                             \
+    const T_VEC i21 = load(input + 21 * stride);                             \
+    const T_VEC i22 = load(input + 22 * stride);                             \
+    const T_VEC i23 = load(input + 23 * stride);                             \
+    const T_VEC i24 = load(input + 24 * stride);                             \
+    const T_VEC i25 = load(input + 25 * stride);                             \
+    const T_VEC i26 = load(input + 26 * stride);                             \
+    const T_VEC i27 = load(input + 27 * stride);                             \
+    const T_VEC i28 = load(input + 28 * stride);                             \
+    const T_VEC i29 = load(input + 29 * stride);                             \
+    const T_VEC i30 = load(input + 30 * stride);                             \
+    const T_VEC i31 = load(input + 31 * stride);                             \
+    const T_VEC w0 = add(i0, i16);                                           \
+    const T_VEC w1 = sub(i0, i16);                                           \
+    const T_VEC w2 = add(i8, i24);                                           \
+    const T_VEC w3 = sub(i8, i24);                                           \
+    const T_VEC w4 = add(w0, w2);                                            \
+    const T_VEC w5 = sub(w0, w2);                                            \
+    const T_VEC w7 = add(i4, i20);                                           \
+    const T_VEC w8 = sub(i4, i20);                                           \
+    const T_VEC w9 = add(i12, i28);                                          \
+    const T_VEC w10 = sub(i12, i28);                                         \
+    const T_VEC w11 = add(w7, w9);                                           \
+    const T_VEC w12 = sub(w7, w9);                                           \
+    const T_VEC w14 = add(w4, w11);                                          \
+    const T_VEC w15 = sub(w4, w11);                                          \
+    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
+                           sub(sub(kWeight0, w3),                            \
+                               mul(kWeight2, add(w10, w8))) };               \
+    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
+                           sub(w3, mul(kWeight2, add(w10, w8))) };           \
+    const T_VEC w19 = add(i2, i18);                                          \
+    const T_VEC w20 = sub(i2, i18);                                          \
+    const T_VEC w21 = add(i10, i26);                                         \
+    const T_VEC w22 = sub(i10, i26);                                         \
+    const T_VEC w23 = add(w19, w21);                                         \
+    const T_VEC w24 = sub(w19, w21);                                         \
+    const T_VEC w26 = add(i6, i22);                                          \
+    const T_VEC w27 = sub(i6, i22);                                          \
+    const T_VEC w28 = add(i14, i30);                                         \
+    const T_VEC w29 = sub(i14, i30);                                         \
+    const T_VEC w30 = add(w26, w28);                                         \
+    const T_VEC w31 = sub(w26, w28);                                         \
+    const T_VEC w33 = add(w23, w30);                                         \
+    const T_VEC w34 = sub(w23, w30);                                         \
+    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
+                           sub(sub(kWeight0, w22),                           \
+                               mul(kWeight2, add(w29, w27))) };              \
+    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
+                           sub(w22, mul(kWeight2, add(w29, w27))) };         \
+    const T_VEC w38 = add(w14, w33);                                         \
+    const T_VEC w39 = sub(w14, w33);                                         \
+    const T_VEC w40[2] = {                                                   \
+      add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
+      add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
+    };                                                                       \
+    const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
+                           sub(sub(kWeight0, w12),                           \
+                               mul(kWeight2, add(w31, w24))) };              \
+    const T_VEC w42[2] = {                                                   \
+      add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
+      add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
+    };                                                                       \
+    const T_VEC w44[2] = {                                                   \
+      add(w18[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+      sub(sub(kWeight0, w18[1]),                                             \
+          sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
+    };                                                                       \
+    const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
+                           sub(w12, mul(kWeight2, add(w31, w24))) };         \
+    const T_VEC w46[2] = {                                                   \
+      add(w16[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+      sub(sub(kWeight0, w16[1]),                                             \
+          sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
+    };                                                                       \
+    const T_VEC w47 = add(i1, i17);                                          \
+    const T_VEC w48 = sub(i1, i17);                                          \
+    const T_VEC w49 = add(i9, i25);                                          \
+    const T_VEC w50 = sub(i9, i25);                                          \
+    const T_VEC w51 = add(w47, w49);                                         \
+    const T_VEC w52 = sub(w47, w49);                                         \
+    const T_VEC w54 = add(i5, i21);                                          \
+    const T_VEC w55 = sub(i5, i21);                                          \
+    const T_VEC w56 = add(i13, i29);                                         \
+    const T_VEC w57 = sub(i13, i29);                                         \
+    const T_VEC w58 = add(w54, w56);                                         \
+    const T_VEC w59 = sub(w54, w56);                                         \
+    const T_VEC w61 = add(w51, w58);                                         \
+    const T_VEC w62 = sub(w51, w58);                                         \
+    const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
+                           sub(sub(kWeight0, w50),                           \
+                               mul(kWeight2, add(w57, w55))) };              \
+    const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
+                           sub(w50, mul(kWeight2, add(w57, w55))) };         \
+    const T_VEC w66 = add(i3, i19);                                          \
+    const T_VEC w67 = sub(i3, i19);                                          \
+    const T_VEC w68 = add(i11, i27);                                         \
+    const T_VEC w69 = sub(i11, i27);                                         \
+    const T_VEC w70 = add(w66, w68);                                         \
+    const T_VEC w71 = sub(w66, w68);                                         \
+    const T_VEC w73 = add(i7, i23);                                          \
+    const T_VEC w74 = sub(i7, i23);                                          \
+    const T_VEC w75 = add(i15, i31);                                         \
+    const T_VEC w76 = sub(i15, i31);                                         \
+    const T_VEC w77 = add(w73, w75);                                         \
+    const T_VEC w78 = sub(w73, w75);                                         \
+    const T_VEC w80 = add(w70, w77);                                         \
+    const T_VEC w81 = sub(w70, w77);                                         \
+    const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
+                           sub(sub(kWeight0, w69),                           \
+                               mul(kWeight2, add(w76, w74))) };              \
+    const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
+                           sub(w69, mul(kWeight2, add(w76, w74))) };         \
+    const T_VEC w85 = add(w61, w80);                                         \
+    const T_VEC w86 = sub(w61, w80);                                         \
+    const T_VEC w87[2] = {                                                   \
+      add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
+      add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
+    };                                                                       \
+    const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
+                           sub(sub(kWeight0, w59),                           \
+                               mul(kWeight2, add(w78, w71))) };              \
+    const T_VEC w89[2] = {                                                   \
+      add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
+      add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
+    };                                                                       \
+    const T_VEC w91[2] = {                                                   \
+      add(w65[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+      sub(sub(kWeight0, w65[1]),                                             \
+          sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
+    };                                                                       \
+    const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
+                           sub(w59, mul(kWeight2, add(w78, w71))) };         \
+    const T_VEC w93[2] = {                                                   \
+      add(w63[0],                                                            \
+          sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+      sub(sub(kWeight0, w63[1]),                                             \
+          sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
+    };                                                                       \
+    store(output + 0 * stride, add(w38, w85));                               \
+    store(output + 1 * stride,                                               \
+          add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
+    store(output + 2 * stride,                                               \
+          add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
+    store(output + 3 * stride,                                               \
+          add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
+    store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
+    store(output + 5 * stride,                                               \
+          add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
+    store(output + 6 * stride,                                               \
+          add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
+    store(output + 7 * stride,                                               \
+          add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
+    store(output + 8 * stride, w39);                                         \
+    store(output + 9 * stride,                                               \
+          add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
+                          mul(kWeight5, w93[1]))));                          \
+    store(output + 10 * stride,                                              \
+          add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
+                          mul(kWeight3, w92[1]))));                          \
+    store(output + 11 * stride,                                              \
+          add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
+                          mul(kWeight7, w91[1]))));                          \
+    store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
+    store(output + 13 * stride,                                              \
+          add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
+                          mul(kWeight8, w89[1]))));                          \
+    store(output + 14 * stride,                                              \
+          add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
+                          mul(kWeight4, w88[1]))));                          \
+    store(output + 15 * stride,                                              \
+          add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
+                          mul(kWeight6, w87[1]))));                          \
+    store(output + 16 * stride, sub(w38, w85));                              \
+    store(output + 17 * stride,                                              \
+          add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
+    store(output + 18 * stride,                                              \
+          add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
+    store(output + 19 * stride,                                              \
+          add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
+    store(output + 20 * stride,                                              \
+          sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
+    store(output + 21 * stride,                                              \
+          add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
+    store(output + 22 * stride,                                              \
+          add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
+    store(output + 23 * stride,                                              \
+          add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
+    store(output + 24 * stride, sub(kWeight0, w86));                         \
+    store(output + 25 * stride,                                              \
+          sub(sub(kWeight0, w46[1]),                                         \
+              sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
+    store(output + 26 * stride,                                              \
+          sub(sub(kWeight0, w45[1]),                                         \
+              sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
+    store(output + 27 * stride,                                              \
+          sub(sub(kWeight0, w44[1]),                                         \
+              sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
+    store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
+    store(output + 29 * stride,                                              \
+          sub(sub(kWeight0, w42[1]),                                         \
+              sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
+    store(output + 30 * stride,                                              \
+          sub(sub(kWeight0, w41[1]),                                         \
+              sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
+    store(output + 31 * stride,                                              \
+          sub(sub(kWeight0, w40[1]),                                         \
+              sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
+  }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) /* Defines aom_ifft1d_2_<suffix>: a 2-point sum/difference step. */ \
+  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { /* ret: return type; T: pointer element type; element k lives at k * stride. */ \
+    const T_VEC i0 = load(input + 0 * stride); /* T_VEC: type of a loaded value; load/store: caller-supplied accessors. */ \
+    const T_VEC i1 = load(input + 1 * stride);                       \
+    store(output + 0 * stride, i0 + i1); /* Uses the built-in + and - operators (no add/sub hooks), so T_VEC must support them. */ \
+    store(output + 1 * stride, i0 - i1);                             \
+  }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) /* Defines aom_ifft1d_4_<suffix>: a 4-point step using only add/sub. */ \
+  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { /* Element k is read/written at k * stride. */ \
+    const T_VEC kWeight0 = constant(0.0f); /* Zero; negation is spelled sub(kWeight0, x). */ \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC w2 = add(i0, i2);                                          \
+    const T_VEC w3 = sub(i0, i2);                                          \
+    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; /* Only w4[0] = 2*i1 is consumed below. */ \
+    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; /* Only w5[1] = -2*i3 is consumed below. */ \
+    store(output + 0 * stride, add(w2, w4[0])); /* i0 + i2 + 2*i1 */ \
+    store(output + 1 * stride, add(w3, w5[1])); /* i0 - i2 - 2*i3 */ \
+    store(output + 2 * stride, sub(w2, w4[0])); /* i0 + i2 - 2*i1 */ \
+    store(output + 3 * stride, sub(w3, w5[1])); /* i0 - i2 + 2*i3 */ \
+  }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+                   mul) /* Defines aom_ifft1d_8_<suffix>: 8 strided loads, a fixed add/sub/mul network, 8 strided stores. */ \
+  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { /* Element k is read/written at k * stride. */ \
+    const T_VEC kWeight0 = constant(0.0f); /* Zero; negation is spelled sub(kWeight0, x). */ \
+    const T_VEC kWeight2 = constant(0.707107f); /* Approximately sqrt(2)/2. */ \
+    const T_VEC i0 = load(input + 0 * stride);                             \
+    const T_VEC i1 = load(input + 1 * stride);                             \
+    const T_VEC i2 = load(input + 2 * stride);                             \
+    const T_VEC i3 = load(input + 3 * stride);                             \
+    const T_VEC i4 = load(input + 4 * stride);                             \
+    const T_VEC i5 = load(input + 5 * stride);                             \
+    const T_VEC i6 = load(input + 6 * stride);                             \
+    const T_VEC i7 = load(input + 7 * stride);                             \
+    const T_VEC w6 = add(i0, i4); /* w6-w13 are built from the even-indexed inputs i0, i2, i4, i6. */ \
+    const T_VEC w7 = sub(i0, i4);                                          \
+    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
+    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
+    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
+    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
+    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
+    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
+    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; /* w14-w21 are built from the odd-indexed inputs i1, i3, i5, i7. */ \
+    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
+    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
+    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
+    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
+    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
+    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
+    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
+    store(output + 0 * stride, add(w10[0], w18[0])); /* The stores read only w10[0]-w13[0], w18[0], w19[1], and both elements of w20 and w21. */ \
+    store(output + 1 * stride,                                             \
+          add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
+    store(output + 2 * stride, add(w11[0], w19[1]));                       \
+    store(output + 3 * stride,                                             \
+          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
+    store(output + 4 * stride, sub(w10[0], w18[0])); /* Outputs 4-7 reuse the terms of outputs 0-3 with the second term's sign flipped. */ \
+    store(output + 5 * stride,                                             \
+          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
+                          mul(kWeight2, w20[1]))));                        \
+    store(output + 6 * stride, sub(w11[0], w19[1]));                       \
+    store(output + 7 * stride,                                             \
+          add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
+  }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
+                    mul) /* Defines aom_ifft1d_16_<suffix>: 16 strided loads, a fixed add/sub/mul network, 16 strided stores. */ \
+  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { /* Element k is read/written at k * stride. */ \
+    const T_VEC kWeight0 = constant(0.0f); /* Zero; negation is spelled sub(kWeight0, x). */ \
+    const T_VEC kWeight2 = constant(0.707107f); /* Approximately sqrt(2)/2. */ \
+    const T_VEC kWeight3 = constant(0.92388f); /* Approximately cos(pi/8). */ \
+    const T_VEC kWeight4 = constant(0.382683f); /* Approximately sin(pi/8). */ \
+    const T_VEC i0 = load(input + 0 * stride);                                \
+    const T_VEC i1 = load(input + 1 * stride);                                \
+    const T_VEC i2 = load(input + 2 * stride);                                \
+    const T_VEC i3 = load(input + 3 * stride);                                \
+    const T_VEC i4 = load(input + 4 * stride);                                \
+    const T_VEC i5 = load(input + 5 * stride);                                \
+    const T_VEC i6 = load(input + 6 * stride);                                \
+    const T_VEC i7 = load(input + 7 * stride);                                \
+    const T_VEC i8 = load(input + 8 * stride);                                \
+    const T_VEC i9 = load(input + 9 * stride);                                \
+    const T_VEC i10 = load(input + 10 * stride);                              \
+    const T_VEC i11 = load(input + 11 * stride);                              \
+    const T_VEC i12 = load(input + 12 * stride);                              \
+    const T_VEC i13 = load(input + 13 * stride);                              \
+    const T_VEC i14 = load(input + 14 * stride);                              \
+    const T_VEC i15 = load(input + 15 * stride);                              \
+    const T_VEC w14 = add(i0, i8); /* w14-w37 are built from the even-indexed inputs only. */ \
+    const T_VEC w15 = sub(i0, i8);                                            \
+    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) };                      \
+    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) };       \
+    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] };                        \
+    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) };         \
+    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) };         \
+    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] };                        \
+    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) };                      \
+    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) };       \
+    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) };                      \
+    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) };       \
+    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) };        \
+    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) };        \
+    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) };        \
+    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) };        \
+    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) };        \
+    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) };        \
+    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))),   \
+                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+    const T_VEC w33[2] = { add(w20[0],                                        \
+                               sub(sub(kWeight0, mul(kWeight2, w28[0])),      \
+                                   mul(kWeight2, w28[1]))),                   \
+                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) };        \
+    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) };        \
+    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
+                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
+                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; /* w38-w61 are built from the odd-indexed inputs only. */ \
+    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) };        \
+    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) };                      \
+    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) };       \
+    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };        \
+    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };        \
+    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };        \
+    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };        \
+    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) };                      \
+    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) };       \
+    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) };                       \
+    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) };        \
+    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) };        \
+    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) };        \
+    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) };        \
+    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) };        \
+    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) };        \
+    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) };        \
+    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))),   \
+                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+    const T_VEC w57[2] = { add(w44[0],                                        \
+                               sub(sub(kWeight0, mul(kWeight2, w52[0])),      \
+                                   mul(kWeight2, w52[1]))),                   \
+                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) };        \
+    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) };        \
+    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
+                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
+                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+    store(output + 0 * stride, add(w30[0], w54[0])); /* Each output k in 0-7 is an even-input term (w30[0]-w37[0]) combined with a weighted odd-input term (w54-w61). */ \
+    store(output + 1 * stride,                                                \
+          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1]))));    \
+    store(output + 2 * stride,                                                \
+          add(w34[0], mul(kWeight2, add(w58[0], w58[1]))));                   \
+    store(output + 3 * stride,                                                \
+          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1]))));    \
+    store(output + 4 * stride, add(w31[0], w55[1]));                          \
+    store(output + 5 * stride,                                                \
+          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
+    store(output + 6 * stride,                                                \
+          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
+    store(output + 7 * stride,                                                \
+          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
+    store(output + 8 * stride, sub(w30[0], w54[0])); /* Output k + 8 reuses output k's two terms with the odd-input term's sign flipped. */ \
+    store(output + 9 * stride,                                                \
+          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])),               \
+                          mul(kWeight4, w56[1]))));                           \
+    store(output + 10 * stride,                                               \
+          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])),               \
+                          mul(kWeight2, w58[1]))));                           \
+    store(output + 11 * stride,                                               \
+          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])),               \
+                          mul(kWeight3, w60[1]))));                           \
+    store(output + 12 * stride, sub(w31[0], w55[1]));                         \
+    store(output + 13 * stride,                                               \
+          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
+    store(output + 14 * stride,                                               \
+          add(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
+    store(output + 15 * stride,                                               \
+          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
+  }
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,    \
+                    mul)                                                       \
+  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) {          \
+    const T_VEC kWeight0 = constant(0.0f);                                     \
+    const T_VEC kWeight2 = constant(0.707107f);                                \
+    const T_VEC kWeight3 = constant(0.92388f);                                 \
+    const T_VEC kWeight4 = constant(0.382683f);                                \
+    const T_VEC kWeight5 = constant(0.980785f);                                \
+    const T_VEC kWeight6 = constant(0.19509f);                                 \
+    const T_VEC kWeight7 = constant(0.83147f);                                 \
+    const T_VEC kWeight8 = constant(0.55557f);                                 \
+    const T_VEC i0 = load(input + 0 * stride);                                 \
+    const T_VEC i1 = load(input + 1 * stride);                                 \
+    const T_VEC i2 = load(input + 2 * stride);                                 \
+    const T_VEC i3 = load(input + 3 * stride);                                 \
+    const T_VEC i4 = load(input + 4 * stride);                                 \
+    const T_VEC i5 = load(input + 5 * stride);                                 \
+    const T_VEC i6 = load(input + 6 * stride);                                 \
+    const T_VEC i7 = load(input + 7 * stride);                                 \
+    const T_VEC i8 = load(input + 8 * stride);                                 \
+    const T_VEC i9 = load(input + 9 * stride);                                 \
+    const T_VEC i10 = load(input + 10 * stride);                               \
+    const T_VEC i11 = load(input + 11 * stride);                               \
+    const T_VEC i12 = load(input + 12 * stride);                               \
+    const T_VEC i13 = load(input + 13 * stride);                               \
+    const T_VEC i14 = load(input + 14 * stride);                               \
+    const T_VEC i15 = load(input + 15 * stride);                               \
+    const T_VEC i16 = load(input + 16 * stride);                               \
+    const T_VEC i17 = load(input + 17 * stride);                               \
+    const T_VEC i18 = load(input + 18 * stride);                               \
+    const T_VEC i19 = load(input + 19 * stride);                               \
+    const T_VEC i20 = load(input + 20 * stride);                               \
+    const T_VEC i21 = load(input + 21 * stride);                               \
+    const T_VEC i22 = load(input + 22 * stride);                               \
+    const T_VEC i23 = load(input + 23 * stride);                               \
+    const T_VEC i24 = load(input + 24 * stride);                               \
+    const T_VEC i25 = load(input + 25 * stride);                               \
+    const T_VEC i26 = load(input + 26 * stride);                               \
+    const T_VEC i27 = load(input + 27 * stride);                               \
+    const T_VEC i28 = load(input + 28 * stride);                               \
+    const T_VEC i29 = load(input + 29 * stride);                               \
+    const T_VEC i30 = load(input + 30 * stride);                               \
+    const T_VEC i31 = load(input + 31 * stride);                               \
+    const T_VEC w30 = add(i0, i16);                                            \
+    const T_VEC w31 = sub(i0, i16);                                            \
+    const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) };                       \
+    const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) };        \
+    const T_VEC w34[2] = { add(w30, w32[0]), w32[1] };                         \
+    const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) };          \
+    const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) };          \
+    const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] };                         \
+    const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) };                      \
+    const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) };       \
+    const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) };                      \
+    const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) };       \
+    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };         \
+    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };         \
+    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };         \
+    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };         \
+    const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) };         \
+    const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) };         \
+    const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))),    \
+                           add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) };  \
+    const T_VEC w49[2] = { add(w36[0],                                         \
+                               sub(sub(kWeight0, mul(kWeight2, w44[0])),       \
+                                   mul(kWeight2, w44[1]))),                    \
+                           add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) };  \
+    const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) };         \
+    const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) };         \
+    const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
+                           sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
+    const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
+                           add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
+    const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) };                      \
+    const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) };       \
+    const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) };                      \
+    const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) };       \
+    const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) };         \
+    const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) };         \
+    const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) };         \
+    const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) };         \
+    const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) };                      \
+    const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) };       \
+    const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) };                      \
+    const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) };       \
+    const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) };         \
+    const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) };         \
+    const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) };         \
+    const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) };         \
+    const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) };         \
+    const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) };         \
+    const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))),    \
+                           add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) };  \
+    const T_VEC w73[2] = { add(w60[0],                                         \
+                               sub(sub(kWeight0, mul(kWeight2, w68[0])),       \
+                                   mul(kWeight2, w68[1]))),                    \
+                           add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) };  \
+    const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) };         \
+    const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) };         \
+    const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
+                           sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
+    const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
+                           add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
+    const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) };         \
+    const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) };         \
+    const T_VEC w80[2] = {                                                     \
+      add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))),          \
+      add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0])))           \
+    };                                                                         \
+    const T_VEC w81[2] = {                                                     \
+      add(w48[0],                                                              \
+          sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))),   \
+      add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1])))           \
+    };                                                                         \
+    const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))),    \
+                           add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) };  \
+    const T_VEC w83[2] = { add(w50[0],                                         \
+                               sub(sub(kWeight0, mul(kWeight2, w74[0])),       \
+                                   mul(kWeight2, w74[1]))),                    \
+                           add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) };  \
+    const T_VEC w84[2] = {                                                     \
+      add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))),          \
+      add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0])))           \
+    };                                                                         \
+    const T_VEC w85[2] = {                                                     \
+      add(w52[0],                                                              \
+          sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))),   \
+      add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1])))           \
+    };                                                                         \
+    const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) };         \
+    const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) };         \
+    const T_VEC w88[2] = {                                                     \
+      sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
+      add(w49[1],                                                              \
+          sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0])))    \
+    };                                                                         \
+    const T_VEC w89[2] = {                                                     \
+      add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
+      add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0])))           \
+    };                                                                         \
+    const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
+                           sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
+    const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
+                           add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
+    const T_VEC w92[2] = {                                                     \
+      sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
+      add(w53[1],                                                              \
+          sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0])))    \
+    };                                                                         \
+    const T_VEC w93[2] = {                                                     \
+      add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
+      add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0])))           \
+    };                                                                         \
+    const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) };                      \
+    const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) };       \
+    const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) };                       \
+    const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) };        \
+    const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) };         \
+    const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) };         \
+    const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) };        \
+    const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) };        \
+    const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) };                     \
+    const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) };      \
+    const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) };                     \
+    const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) };      \
+    const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) };    \
+    const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) };    \
+    const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) };    \
+    const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) };    \
+    const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) };      \
+    const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) };      \
+    const T_VEC w112[2] = {                                                    \
+      add(w100[0], mul(kWeight2, add(w108[0], w108[1]))),                      \
+      add(w100[1], mul(kWeight2, sub(w108[1], w108[0])))                       \
+    };                                                                         \
+    const T_VEC w113[2] = {                                                    \
+      add(w100[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+      add(w100[1], mul(kWeight2, sub(w108[0], w108[1])))                       \
+    };                                                                         \
+    const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) };      \
+    const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) };      \
+    const T_VEC w116[2] = {                                                    \
+      sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
+      sub(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
+    };                                                                         \
+    const T_VEC w117[2] = {                                                    \
+      add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
+      add(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
+    };                                                                         \
+    const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) };                     \
+    const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) };      \
+    const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) };                     \
+    const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) };      \
+    const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) };    \
+    const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) };    \
+    const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) };    \
+    const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) };    \
+    const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) };                      \
+    const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) };       \
+    const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) };                     \
+    const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) };      \
+    const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) };    \
+    const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) };    \
+    const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) };    \
+    const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) };    \
+    const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) };    \
+    const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) };    \
+    const T_VEC w136[2] = {                                                    \
+      add(w124[0], mul(kWeight2, add(w132[0], w132[1]))),                      \
+      add(w124[1], mul(kWeight2, sub(w132[1], w132[0])))                       \
+    };                                                                         \
+    const T_VEC w137[2] = {                                                    \
+      add(w124[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+      add(w124[1], mul(kWeight2, sub(w132[0], w132[1])))                       \
+    };                                                                         \
+    const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) };    \
+    const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) };    \
+    const T_VEC w140[2] = {                                                    \
+      sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
+      sub(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
+    };                                                                         \
+    const T_VEC w141[2] = {                                                    \
+      add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
+      add(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
+    };                                                                         \
+    const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) };    \
+    const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) };    \
+    const T_VEC w144[2] = {                                                    \
+      add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))),       \
+      add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0])))        \
+    };                                                                         \
+    const T_VEC w145[2] = {                                                    \
+      add(w112[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+      add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1])))        \
+    };                                                                         \
+    const T_VEC w146[2] = {                                                    \
+      add(w114[0], mul(kWeight2, add(w138[0], w138[1]))),                      \
+      add(w114[1], mul(kWeight2, sub(w138[1], w138[0])))                       \
+    };                                                                         \
+    const T_VEC w147[2] = {                                                    \
+      add(w114[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+      add(w114[1], mul(kWeight2, sub(w138[0], w138[1])))                       \
+    };                                                                         \
+    const T_VEC w148[2] = {                                                    \
+      add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))),       \
+      add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0])))        \
+    };                                                                         \
+    const T_VEC w149[2] = {                                                    \
+      add(w116[0],                                                             \
+          sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+      add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1])))        \
+    };                                                                         \
+    const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) };    \
+    const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) };    \
+    const T_VEC w152[2] = {                                                    \
+      sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
+      add(w113[1],                                                             \
+          sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0])))  \
+    };                                                                         \
+    const T_VEC w153[2] = {                                                    \
+      add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
+      add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0])))        \
+    };                                                                         \
+    const T_VEC w154[2] = {                                                    \
+      sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
+      sub(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
+    };                                                                         \
+    const T_VEC w155[2] = {                                                    \
+      add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
+      add(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
+    };                                                                         \
+    const T_VEC w156[2] = {                                                    \
+      sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
+      add(w117[1],                                                             \
+          sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0])))  \
+    };                                                                         \
+    const T_VEC w157[2] = {                                                    \
+      add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
+      add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0])))        \
+    };                                                                         \
+    store(output + 0 * stride, add(w78[0], w142[0]));                          \
+    store(output + 1 * stride,                                                 \
+          add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1]))));   \
+    store(output + 2 * stride,                                                 \
+          add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1]))));   \
+    store(output + 3 * stride,                                                 \
+          add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1]))));   \
+    store(output + 4 * stride,                                                 \
+          add(w86[0], mul(kWeight2, add(w150[0], w150[1]))));                  \
+    store(output + 5 * stride,                                                 \
+          add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1]))));   \
+    store(output + 6 * stride,                                                 \
+          add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1]))));   \
+    store(output + 7 * stride,                                                 \
+          add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1]))));   \
+    store(output + 8 * stride, add(w79[0], w143[1]));                          \
+    store(output + 9 * stride,                                                 \
+          sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
+    store(output + 10 * stride,                                                \
+          sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
+    store(output + 11 * stride,                                                \
+          sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
+    store(output + 12 * stride,                                                \
+          sub(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
+    store(output + 13 * stride,                                                \
+          sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
+    store(output + 14 * stride,                                                \
+          sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
+    store(output + 15 * stride,                                                \
+          sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
+    store(output + 16 * stride, sub(w78[0], w142[0]));                         \
+    store(output + 17 * stride,                                                \
+          add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])),               \
+                          mul(kWeight6, w144[1]))));                           \
+    store(output + 18 * stride,                                                \
+          add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])),               \
+                          mul(kWeight4, w146[1]))));                           \
+    store(output + 19 * stride,                                                \
+          add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])),               \
+                          mul(kWeight8, w148[1]))));                           \
+    store(output + 20 * stride,                                                \
+          add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])),               \
+                          mul(kWeight2, w150[1]))));                           \
+    store(output + 21 * stride,                                                \
+          add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])),               \
+                          mul(kWeight7, w152[1]))));                           \
+    store(output + 22 * stride,                                                \
+          add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])),               \
+                          mul(kWeight3, w154[1]))));                           \
+    store(output + 23 * stride,                                                \
+          add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])),               \
+                          mul(kWeight5, w156[1]))));                           \
+    store(output + 24 * stride, sub(w79[0], w143[1]));                         \
+    store(output + 25 * stride,                                                \
+          add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
+    store(output + 26 * stride,                                                \
+          add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
+    store(output + 27 * stride,                                                \
+          add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
+    store(output + 28 * stride,                                                \
+          add(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
+    store(output + 29 * stride,                                                \
+          add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
+    store(output + 30 * stride,                                                \
+          add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
+    store(output + 31 * stride,                                                \
+          add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
+  }
+
+#endif  // AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
index 1ceef7782..e50f951c1 100644
--- a/third_party/aom/aom_dsp/fwd_txfm.c
+++ b/third_party/aom/aom_dsp/fwd_txfm.c
@@ -9,84 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_dsp/fwd_txfm.h"
 #include <assert.h>
-#include "./aom_dsp_rtcd.h"
-
-void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[4 * 4];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t in_high[4];    // canbe16
-    tran_high_t step[4];       // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 4; ++i) {
-      // Load inputs.
-      if (pass == 0) {
-        in_high[0] = input[0 * stride] * 16;
-        in_high[1] = input[1 * stride] * 16;
-        in_high[2] = input[2 * stride] * 16;
-        in_high[3] = input[3 * stride] * 16;
-        if (i == 0 && in_high[0]) {
-          ++in_high[0];
-        }
-      } else {
-        assert(in_low != NULL);
-        in_high[0] = in_low[0 * 4];
-        in_high[1] = in_low[1 * 4];
-        in_high[2] = in_low[2 * 4];
-        in_high[3] = in_low[3 * 4];
-        ++in_low;
-      }
-      // Transform.
-      step[0] = in_high[0] + in_high[3];
-      step[1] = in_high[1] + in_high[2];
-      step[2] = in_high[1] - in_high[2];
-      step[3] = in_high[0] - in_high[3];
-      temp1 = (step[0] + step[1]) * cospi_16_64;
-      temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = (tran_low_t)fdct_round_shift(temp1);
-      out[2] = (tran_low_t)fdct_round_shift(temp2);
-      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = (tran_low_t)fdct_round_shift(temp1);
-      out[3] = (tran_low_t)fdct_round_shift(temp2);
-      // Do next column (which is a transposed row in second/horizontal pass)
-      ++input;
-      out += 4;
-    }
-    // Setup in/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-
-  {
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
-    }
-  }
-}
-
-void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 4; ++r)
-    for (c = 0; c < 4; ++c) sum += input[r * stride + c];
-
-  output[0] = sum << 1;
-}
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
 
 void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
   int i, j;
@@ -172,596 +97,7 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
   }
 }
 
-void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[256];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t step1[8];      // canbe16
-    tran_high_t step2[8];      // canbe16
-    tran_high_t step3[8];      // canbe16
-    tran_high_t in_high[8];    // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 16; i++) {
-      if (0 == pass) {
-        // Calculate input for the first 8 results.
-        in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
-        in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
-        in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
-        in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
-        in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
-        in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
-        in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
-        in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
-        // Calculate input for the next 8 results.
-        step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
-        step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
-        step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
-        step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
-        step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
-        step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
-        step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
-        step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
-      } else {
-        // Calculate input for the first 8 results.
-        assert(in_low != NULL);
-        in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
-        in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
-        in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
-        in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
-        in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
-        in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
-        in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
-        in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
-        // Calculate input for the next 8 results.
-        step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
-        step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
-        step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
-        step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
-        step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
-        step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
-        step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
-        step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
-        in_low++;
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-        tran_high_t t0, t1, t2, t3;                  // needs32
-        tran_high_t x0, x1, x2, x3;                  // canbe16
-
-        // stage 1
-        s0 = in_high[0] + in_high[7];
-        s1 = in_high[1] + in_high[6];
-        s2 = in_high[2] + in_high[5];
-        s3 = in_high[3] + in_high[4];
-        s4 = in_high[3] - in_high[4];
-        s5 = in_high[2] - in_high[5];
-        s6 = in_high[1] - in_high[6];
-        s7 = in_high[0] - in_high[7];
-
-        // fdct4(step, step);
-        x0 = s0 + s3;
-        x1 = s1 + s2;
-        x2 = s1 - s2;
-        x3 = s0 - s3;
-        t0 = (x0 + x1) * cospi_16_64;
-        t1 = (x0 - x1) * cospi_16_64;
-        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
-        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = (tran_low_t)fdct_round_shift(t0);
-        out[4] = (tran_low_t)fdct_round_shift(t2);
-        out[8] = (tran_low_t)fdct_round_shift(t1);
-        out[12] = (tran_low_t)fdct_round_shift(t3);
-
-        // Stage 2
-        t0 = (s6 - s5) * cospi_16_64;
-        t1 = (s6 + s5) * cospi_16_64;
-        t2 = fdct_round_shift(t0);
-        t3 = fdct_round_shift(t1);
-
-        // Stage 3
-        x0 = s4 + t2;
-        x1 = s4 - t2;
-        x2 = s7 - t3;
-        x3 = s7 + t3;
-
-        // Stage 4
-        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-        out[2] = (tran_low_t)fdct_round_shift(t0);
-        out[6] = (tran_low_t)fdct_round_shift(t2);
-        out[10] = (tran_low_t)fdct_round_shift(t1);
-        out[14] = (tran_low_t)fdct_round_shift(t3);
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        temp1 = (step1[5] - step1[2]) * cospi_16_64;
-        temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = fdct_round_shift(temp1);
-        step2[3] = fdct_round_shift(temp2);
-        temp1 = (step1[4] + step1[3]) * cospi_16_64;
-        temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = fdct_round_shift(temp1);
-        step2[5] = fdct_round_shift(temp2);
-        // step 3
-        step3[0] = step1[0] + step2[3];
-        step3[1] = step1[1] + step2[2];
-        step3[2] = step1[1] - step2[2];
-        step3[3] = step1[0] - step2[3];
-        step3[4] = step1[7] - step2[4];
-        step3[5] = step1[6] - step2[5];
-        step3[6] = step1[6] + step2[5];
-        step3[7] = step1[7] + step2[4];
-        // step 4
-        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
-        temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
-        step2[1] = fdct_round_shift(temp1);
-        step2[2] = fdct_round_shift(temp2);
-        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
-        step2[5] = fdct_round_shift(temp1);
-        step2[6] = fdct_round_shift(temp2);
-        // step 5
-        step1[0] = step3[0] + step2[1];
-        step1[1] = step3[0] - step2[1];
-        step1[2] = step3[3] + step2[2];
-        step1[3] = step3[3] - step2[2];
-        step1[4] = step3[4] - step2[5];
-        step1[5] = step3[4] + step2[5];
-        step1[6] = step3[7] - step2[6];
-        step1[7] = step3[7] + step2[6];
-        // step 6
-        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
-        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = (tran_low_t)fdct_round_shift(temp1);
-        out[9] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = (tran_low_t)fdct_round_shift(temp1);
-        out[13] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
-        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = (tran_low_t)fdct_round_shift(temp1);
-        out[11] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = (tran_low_t)fdct_round_shift(temp1);
-        out[15] = (tran_low_t)fdct_round_shift(temp2);
-      }
-      // Do next column (which is a transposed row in second/horizontal pass)
-      input++;
-      out += 16;
-    }
-    // Setup in/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-}
-
-static INLINE tran_high_t dct_32_round(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
-  // and make the bounds consts.
-  // assert(-131072 <= rv && rv <= 131071);
-  return rv;
-}
-
-static INLINE tran_high_t half_round_shift(tran_high_t input) {
-  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
-  return rv;
-}
-
-void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
-  tran_high_t step[32];
-  // Stage 1
-  step[0] = input[0] + input[(32 - 1)];
-  step[1] = input[1] + input[(32 - 2)];
-  step[2] = input[2] + input[(32 - 3)];
-  step[3] = input[3] + input[(32 - 4)];
-  step[4] = input[4] + input[(32 - 5)];
-  step[5] = input[5] + input[(32 - 6)];
-  step[6] = input[6] + input[(32 - 7)];
-  step[7] = input[7] + input[(32 - 8)];
-  step[8] = input[8] + input[(32 - 9)];
-  step[9] = input[9] + input[(32 - 10)];
-  step[10] = input[10] + input[(32 - 11)];
-  step[11] = input[11] + input[(32 - 12)];
-  step[12] = input[12] + input[(32 - 13)];
-  step[13] = input[13] + input[(32 - 14)];
-  step[14] = input[14] + input[(32 - 15)];
-  step[15] = input[15] + input[(32 - 16)];
-  step[16] = -input[16] + input[(32 - 17)];
-  step[17] = -input[17] + input[(32 - 18)];
-  step[18] = -input[18] + input[(32 - 19)];
-  step[19] = -input[19] + input[(32 - 20)];
-  step[20] = -input[20] + input[(32 - 21)];
-  step[21] = -input[21] + input[(32 - 22)];
-  step[22] = -input[22] + input[(32 - 23)];
-  step[23] = -input[23] + input[(32 - 24)];
-  step[24] = -input[24] + input[(32 - 25)];
-  step[25] = -input[25] + input[(32 - 26)];
-  step[26] = -input[26] + input[(32 - 27)];
-  step[27] = -input[27] + input[(32 - 28)];
-  step[28] = -input[28] + input[(32 - 29)];
-  step[29] = -input[29] + input[(32 - 30)];
-  step[30] = -input[30] + input[(32 - 31)];
-  step[31] = -input[31] + input[(32 - 32)];
-
-  // Stage 2
-  output[0] = step[0] + step[16 - 1];
-  output[1] = step[1] + step[16 - 2];
-  output[2] = step[2] + step[16 - 3];
-  output[3] = step[3] + step[16 - 4];
-  output[4] = step[4] + step[16 - 5];
-  output[5] = step[5] + step[16 - 6];
-  output[6] = step[6] + step[16 - 7];
-  output[7] = step[7] + step[16 - 8];
-  output[8] = -step[8] + step[16 - 9];
-  output[9] = -step[9] + step[16 - 10];
-  output[10] = -step[10] + step[16 - 11];
-  output[11] = -step[11] + step[16 - 12];
-  output[12] = -step[12] + step[16 - 13];
-  output[13] = -step[13] + step[16 - 14];
-  output[14] = -step[14] + step[16 - 15];
-  output[15] = -step[15] + step[16 - 16];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = step[18];
-  output[19] = step[19];
-
-  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
-  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
-  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
-  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
-  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
-  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
-  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
-  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
-  output[28] = step[28];
-  output[29] = step[29];
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // dump the magnitude by 4, hence the intermediate values are within
-  // the range of 16 bits.
-  if (round) {
-    output[0] = half_round_shift(output[0]);
-    output[1] = half_round_shift(output[1]);
-    output[2] = half_round_shift(output[2]);
-    output[3] = half_round_shift(output[3]);
-    output[4] = half_round_shift(output[4]);
-    output[5] = half_round_shift(output[5]);
-    output[6] = half_round_shift(output[6]);
-    output[7] = half_round_shift(output[7]);
-    output[8] = half_round_shift(output[8]);
-    output[9] = half_round_shift(output[9]);
-    output[10] = half_round_shift(output[10]);
-    output[11] = half_round_shift(output[11]);
-    output[12] = half_round_shift(output[12]);
-    output[13] = half_round_shift(output[13]);
-    output[14] = half_round_shift(output[14]);
-    output[15] = half_round_shift(output[15]);
-
-    output[16] = half_round_shift(output[16]);
-    output[17] = half_round_shift(output[17]);
-    output[18] = half_round_shift(output[18]);
-    output[19] = half_round_shift(output[19]);
-    output[20] = half_round_shift(output[20]);
-    output[21] = half_round_shift(output[21]);
-    output[22] = half_round_shift(output[22]);
-    output[23] = half_round_shift(output[23]);
-    output[24] = half_round_shift(output[24]);
-    output[25] = half_round_shift(output[25]);
-    output[26] = half_round_shift(output[26]);
-    output[27] = half_round_shift(output[27]);
-    output[28] = half_round_shift(output[28]);
-    output[29] = half_round_shift(output[29]);
-    output[30] = half_round_shift(output[30]);
-    output[31] = half_round_shift(output[31]);
-  }
-
-  // Stage 3
-  step[0] = output[0] + output[(8 - 1)];
-  step[1] = output[1] + output[(8 - 2)];
-  step[2] = output[2] + output[(8 - 3)];
-  step[3] = output[3] + output[(8 - 4)];
-  step[4] = -output[4] + output[(8 - 5)];
-  step[5] = -output[5] + output[(8 - 6)];
-  step[6] = -output[6] + output[(8 - 7)];
-  step[7] = -output[7] + output[(8 - 8)];
-  step[8] = output[8];
-  step[9] = output[9];
-  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
-  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
-  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
-  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
-  step[14] = output[14];
-  step[15] = output[15];
-
-  step[16] = output[16] + output[23];
-  step[17] = output[17] + output[22];
-  step[18] = output[18] + output[21];
-  step[19] = output[19] + output[20];
-  step[20] = -output[20] + output[19];
-  step[21] = -output[21] + output[18];
-  step[22] = -output[22] + output[17];
-  step[23] = -output[23] + output[16];
-  step[24] = -output[24] + output[31];
-  step[25] = -output[25] + output[30];
-  step[26] = -output[26] + output[29];
-  step[27] = -output[27] + output[28];
-  step[28] = output[28] + output[27];
-  step[29] = output[29] + output[26];
-  step[30] = output[30] + output[25];
-  step[31] = output[31] + output[24];
-
-  // Stage 4
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = -step[2] + step[1];
-  output[3] = -step[3] + step[0];
-  output[4] = step[4];
-  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
-  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
-  output[7] = step[7];
-  output[8] = step[8] + step[11];
-  output[9] = step[9] + step[10];
-  output[10] = -step[10] + step[9];
-  output[11] = -step[11] + step[8];
-  output[12] = -step[12] + step[15];
-  output[13] = -step[13] + step[14];
-  output[14] = step[14] + step[13];
-  output[15] = step[15] + step[12];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
-  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
-  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
-  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
-  output[22] = step[22];
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = step[25];
-  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
-  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
-  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
-  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // Stage 5
-  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
-  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
-  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
-  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
-  step[4] = output[4] + output[5];
-  step[5] = -output[5] + output[4];
-  step[6] = -output[6] + output[7];
-  step[7] = output[7] + output[6];
-  step[8] = output[8];
-  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
-  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
-  step[11] = output[11];
-  step[12] = output[12];
-  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
-  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
-  step[15] = output[15];
-
-  step[16] = output[16] + output[19];
-  step[17] = output[17] + output[18];
-  step[18] = -output[18] + output[17];
-  step[19] = -output[19] + output[16];
-  step[20] = -output[20] + output[23];
-  step[21] = -output[21] + output[22];
-  step[22] = output[22] + output[21];
-  step[23] = output[23] + output[20];
-  step[24] = output[24] + output[27];
-  step[25] = output[25] + output[26];
-  step[26] = -output[26] + output[25];
-  step[27] = -output[27] + output[24];
-  step[28] = -output[28] + output[31];
-  step[29] = -output[29] + output[30];
-  step[30] = output[30] + output[29];
-  step[31] = output[31] + output[28];
-
-  // Stage 6
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
-  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
-  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
-  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
-  output[8] = step[8] + step[9];
-  output[9] = -step[9] + step[8];
-  output[10] = -step[10] + step[11];
-  output[11] = step[11] + step[10];
-  output[12] = step[12] + step[13];
-  output[13] = -step[13] + step[12];
-  output[14] = -step[14] + step[15];
-  output[15] = step[15] + step[14];
-
-  output[16] = step[16];
-  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
-  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
-  output[19] = step[19];
-  output[20] = step[20];
-  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
-  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
-  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
-  output[27] = step[27];
-  output[28] = step[28];
-  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
-  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
-  output[31] = step[31];
-
-  // Stage 7
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
-  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
-  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
-  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
-  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
-  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
-  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
-  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
-  step[16] = output[16] + output[17];
-  step[17] = -output[17] + output[16];
-  step[18] = -output[18] + output[19];
-  step[19] = output[19] + output[18];
-  step[20] = output[20] + output[21];
-  step[21] = -output[21] + output[20];
-  step[22] = -output[22] + output[23];
-  step[23] = output[23] + output[22];
-  step[24] = output[24] + output[25];
-  step[25] = -output[25] + output[24];
-  step[26] = -output[26] + output[27];
-  step[27] = output[27] + output[26];
-  step[28] = output[28] + output[29];
-  step[29] = -output[29] + output[28];
-  step[30] = -output[30] + output[31];
-  step[31] = output[31] + output[30];
-
-  // Final stage --- outputs indices are bit-reversed.
-  output[0] = step[0];
-  output[16] = step[1];
-  output[8] = step[2];
-  output[24] = step[3];
-  output[4] = step[4];
-  output[20] = step[5];
-  output[12] = step[6];
-  output[28] = step[7];
-  output[2] = step[8];
-  output[18] = step[9];
-  output[10] = step[10];
-  output[26] = step[11];
-  output[6] = step[12];
-  output[22] = step[13];
-  output[14] = step[14];
-  output[30] = step[15];
-
-  output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
-  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
-  output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
-  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
-  output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
-  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
-  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
-  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
-  output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
-  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
-  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
-  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
-  output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
-  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
-  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
-  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
-}
-
-void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      // TODO(cd): see quality impact of only doing
-      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
-      //           PS: also change code in aom_dsp/x86/aom_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    aom_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  aom_fdct4x4_c(input, output, stride);
-}
-
 void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
                           int stride) {
   aom_fdct8x8_c(input, final_output, stride);
 }
-
-void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride) {
-  aom_fdct16x16_c(input, output, stride);
-}
-
-void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  aom_fdct32x32_c(input, out, stride);
-}
-void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                               int stride) {
-  aom_fdct32x32_rd_c(input, out, stride);
-}
-
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h
deleted file mode 100644
index f4dc04ab4..000000000
--- a/third_party/aom/aom_dsp/fwd_txfm.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_FWD_TXFM_H_
-#define AOM_DSP_FWD_TXFM_H_
-
-#include "aom_dsp/txfm_common.h"
-
-static INLINE tran_high_t saturate_int16(tran_high_t value) {
-  tran_high_t result;
-  result = value > INT16_MAX ? INT16_MAX : value;
-  return result < INT16_MIN ? INT16_MIN : result;
-}
-
-void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif  // AOM_DSP_FWD_TXFM_H_
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
new file mode 100644
index 000000000..fcb6c290e
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_synthesis.c
@@ -0,0 +1,1392 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_mem/aom_mem.h"
+
+// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
+// with zero mean and standard deviation of about 512.
+// Values should be divided by 4 for 10-bit range and by 16 for 8-bit range.
+static const int gaussian_sequence[2048] = {
+  56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+  224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+  112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+  -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+  432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+  192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+  540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+  248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+  248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+  340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+  220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+  -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+  60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+  488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+  -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+  -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+  -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+  -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+  728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+  4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+  772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+  -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+  -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+  -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+  1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+  204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+  548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+  -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+  96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+  -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+  240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+  -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+  896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+  -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+  -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+  -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+  -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+  -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+  424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+  436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+  -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+  -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+  496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+  56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+  -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+  540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+  424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+  -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+  756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+  -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+  60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+  -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+  -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+  308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+  -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+  -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+  284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+  264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+  -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+  908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+  124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+  1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+  -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+  -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+  -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+  320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+  -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+  -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+  -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+  -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+  -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+  636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+  -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+  -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+  392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+  -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+  -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+  -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+  756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+  -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+  472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+  844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+  60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+  -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+  -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+  472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+  652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+  -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+  -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+  -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+  -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+  220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+  412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+  320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+  372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+  924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+  332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+  436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+  -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+  1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+  -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+  -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+  -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+  528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+  -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+  -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+  1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+  20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+  96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+  192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+  648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+  816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+  648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+  -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+  -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+  -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+  384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+  -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+  -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+  64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+  -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+  128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+  112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+  828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+  -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+  0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+  -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+  24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+  508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+  716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+  600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+  -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+  -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+  344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+  -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+  164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+  192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+  288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+  -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+  -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+  556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+  268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+  884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+  -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+  -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+  244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+  -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+  -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+  -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+  1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+  -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+  344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+  -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+  1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+  -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+  504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+  76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+  116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+  28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+  -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+  -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+  -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+  -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+  252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+  312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+  732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+  124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+  -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+  440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+  -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+  648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+  680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+  -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+  -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+  -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+  -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+  372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+  -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+  -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+  -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+  -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+  52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+  716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+  -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+  -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+  104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+  428,   -484
+};
+
+static const int gauss_bits = 11;
+
+static int luma_subblock_size_y = 32;
+static int luma_subblock_size_x = 32;
+
+static int chroma_subblock_size_y = 16;
+static int chroma_subblock_size_x = 16;
+
+static const int min_luma_legal_range = 16;
+static const int max_luma_legal_range = 235;
+
+static const int min_chroma_legal_range = 16;
+static const int max_chroma_legal_range = 240;
+
+static int scaling_lut_y[256];
+static int scaling_lut_cb[256];
+static int scaling_lut_cr[256];
+
+static int grain_center;
+static int grain_min;
+static int grain_max;
+
+static uint16_t random_register = 0;  // random number generator register
+
+static void init_arrays(aom_film_grain_t *params, int luma_stride,
+                        int chroma_stride, int ***pred_pos_luma_p,
+                        int ***pred_pos_chroma_p, int **luma_grain_block,
+                        int **cb_grain_block, int **cr_grain_block,
+                        int **y_line_buf, int **cb_line_buf, int **cr_line_buf,
+                        int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
+                        int luma_grain_samples, int chroma_grain_samples,
+                        int chroma_subsamp_y, int chroma_subsamp_x) {
+  memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
+  memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
+  memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
+
+  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  int num_pos_chroma = num_pos_luma;
+  if (params->num_y_points > 0) ++num_pos_chroma;
+
+  int **pred_pos_luma;
+  int **pred_pos_chroma;
+
+  pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma);
+
+  for (int row = 0; row < num_pos_luma; row++) {
+    pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
+  }
+
+  pred_pos_chroma =
+      (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma);
+
+  for (int row = 0; row < num_pos_chroma; row++) {
+    pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
+  }
+
+  int pos_ar_index = 0;
+
+  for (int row = -params->ar_coeff_lag; row < 0; row++) {
+    for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
+         col++) {
+      pred_pos_luma[pos_ar_index][0] = row;
+      pred_pos_luma[pos_ar_index][1] = col;
+      pred_pos_luma[pos_ar_index][2] = 0;
+
+      pred_pos_chroma[pos_ar_index][0] = row;
+      pred_pos_chroma[pos_ar_index][1] = col;
+      pred_pos_chroma[pos_ar_index][2] = 0;
+      ++pos_ar_index;
+    }
+  }
+
+  for (int col = -params->ar_coeff_lag; col < 0; col++) {
+    pred_pos_luma[pos_ar_index][0] = 0;
+    pred_pos_luma[pos_ar_index][1] = col;
+    pred_pos_luma[pos_ar_index][2] = 0;
+
+    pred_pos_chroma[pos_ar_index][0] = 0;
+    pred_pos_chroma[pos_ar_index][1] = col;
+    pred_pos_chroma[pos_ar_index][2] = 0;
+
+    ++pos_ar_index;
+  }
+
+  if (params->num_y_points > 0) {
+    pred_pos_chroma[pos_ar_index][0] = 0;
+    pred_pos_chroma[pos_ar_index][1] = 0;
+    pred_pos_chroma[pos_ar_index][2] = 1;
+  }
+
+  *pred_pos_luma_p = pred_pos_luma;
+  *pred_pos_chroma_p = pred_pos_chroma;
+
+  *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
+  *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
+                                   (2 >> chroma_subsamp_y));
+  *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
+                                   (2 >> chroma_subsamp_y));
+
+  *y_col_buf =
+      (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
+  *cb_col_buf =
+      (int *)aom_malloc(sizeof(**cb_col_buf) *
+                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+                        (2 >> chroma_subsamp_x));
+  *cr_col_buf =
+      (int *)aom_malloc(sizeof(**cr_col_buf) *
+                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+                        (2 >> chroma_subsamp_x));
+
+  *luma_grain_block =
+      (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
+  *cb_grain_block =
+      (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
+  *cr_grain_block =
+      (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
+}
+
+// Frees the two prediction-position tables (row by row, then the row-pointer
+// arrays themselves) followed by the line, column and grain-block buffers
+// reachable through the pointers passed in.
+//
+// The row counts are recomputed from `params`: 2 * lag * (lag + 1) rows for
+// luma, and one more for chroma when luma scaling points are present.  The
+// caller's pointers are not reset after being freed.
+static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma,
+                           int ***pred_pos_chroma, int **luma_grain_block,
+                           int **cb_grain_block, int **cr_grain_block,
+                           int **y_line_buf, int **cb_line_buf,
+                           int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
+                           int **cr_col_buf) {
+  const int lag = params->ar_coeff_lag;
+  const int luma_rows = 2 * lag * (lag + 1);
+  const int chroma_rows = (params->num_y_points > 0) ? luma_rows + 1 : luma_rows;
+
+  int **luma_table = *pred_pos_luma;
+  int r = 0;
+  while (r < luma_rows) {
+    aom_free(luma_table[r]);
+    ++r;
+  }
+  aom_free(luma_table);
+
+  int **chroma_table = *pred_pos_chroma;
+  r = 0;
+  while (r < chroma_rows) {
+    aom_free(chroma_table[r]);
+    ++r;
+  }
+  aom_free(chroma_table);
+
+  // Flat buffers, released in the same order as before.
+  int **const flat_buffers[] = { y_line_buf,       cb_line_buf,    cr_line_buf,
+                                 y_col_buf,        cb_col_buf,     cr_col_buf,
+                                 luma_grain_block, cb_grain_block, cr_grain_block };
+  const int num_flat = (int)(sizeof(flat_buffers) / sizeof(flat_buffers[0]));
+  for (int k = 0; k < num_flat; ++k) {
+    aom_free(*flat_buffers[k]);
+  }
+}
+
+// Steps the shift register kept in random_register once and returns a value
+// in [0, 2^bits - 1] taken from bits 15 down to (16 - bits) of the new state.
+static INLINE int get_random_number(int bits) {
+  // Feedback bit: XOR of register bits 0, 1, 3 and 12.
+  static const int taps[4] = { 0, 1, 3, 12 };
+  uint16_t feedback = 0;
+  for (int t = 0; t < 4; ++t) {
+    feedback ^= (random_register >> taps[t]) & 1;
+  }
+
+  // Shift right by one position and insert the feedback bit at bit 15.
+  random_register = (random_register >> 1) | (feedback << 15);
+
+  const int mask = (1 << bits) - 1;
+  return (random_register >> (16 - bits)) & mask;
+}
+
+// Re-seeds random_register.  The register starts from `seed` (identical for
+// the whole picture) and is then XOR-ed with a 16-bit pattern derived from
+// luma_line >> 5, so the state changes from one group of 32 luma lines to
+// the next.
+static void init_random_generator(int luma_line, uint16_t seed) {
+  // Picture-level part: rebuild the seed from its high and low bytes.
+  const uint16_t seed_hi = (seed >> 8) & 255;
+  const uint16_t seed_lo = seed & 255;
+  random_register = (seed_hi << 8) + seed_lo;
+
+  // Row-level part: one mixing byte for each half of the register.
+  const int stripe = luma_line >> 5;
+  const int hi_mix = (stripe * 37 + 178) & 255;
+  const int lo_mix = (stripe * 173 + 105) & 255;
+  random_register ^= (hi_mix << 8) | lo_mix;
+}
+
+// Builds the luma grain template in `luma_grain_block`.
+//
+// Does nothing when there are no luma scaling points.  Otherwise the whole
+// block is filled, in raster order, with entries of gaussian_sequence picked
+// by get_random_number() and scaled down with rounding.  The area inside the
+// pads is then filtered in place: each sample receives a rounded, weighted
+// sum of neighbours (offsets from pred_pos_luma, weights from
+// params->ar_coeffs_y) and is clamped to [grain_min, grain_max].
+static void generate_luma_grain_block(
+    aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
+    int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
+    int left_pad, int top_pad, int right_pad, int bottom_pad) {
+  if (params->num_y_points == 0) return;
+
+  const int noise_shift = 12 - params->bit_depth + params->grain_scale_shift;
+  const int lag = params->ar_coeff_lag;
+  const int num_taps = 2 * lag * (lag + 1);
+  const int ar_round = (1 << (params->ar_coeff_shift - 1));
+
+  // Pass 1: white noise over the complete block, pads included.
+  for (int row = 0; row < luma_block_size_y; ++row) {
+    for (int col = 0; col < luma_block_size_x; ++col) {
+      const int raw = gaussian_sequence[get_random_number(gauss_bits)];
+      luma_grain_block[row * luma_grain_stride + col] =
+          (raw + ((1 << noise_shift) >> 1)) >> noise_shift;
+    }
+  }
+
+  // Pass 2: filtering.  Samples are updated in raster order, so neighbours
+  // read here may already have been filtered.
+  const int last_row = luma_block_size_y - bottom_pad;
+  const int last_col = luma_block_size_x - right_pad;
+  for (int row = top_pad; row < last_row; ++row) {
+    for (int col = left_pad; col < last_col; ++col) {
+      int acc = 0;
+      for (int tap = 0; tap < num_taps; ++tap) {
+        const int ref_row = row + pred_pos_luma[tap][0];
+        const int ref_col = col + pred_pos_luma[tap][1];
+        acc += params->ar_coeffs_y[tap] *
+               luma_grain_block[ref_row * luma_grain_stride + ref_col];
+      }
+
+      const int here = row * luma_grain_stride + col;
+      luma_grain_block[here] =
+          clamp(luma_grain_block[here] +
+                    ((acc + ar_round) >> params->ar_coeff_shift),
+                grain_min, grain_max);
+    }
+  }
+}
+
+// Builds the Cb and Cr grain templates.
+//
+// A plane carries grain when it has scaling points of its own or when
+// chroma_scaling_from_luma is set.  Such a plane is first filled, in raster
+// order, with scaled entries of gaussian_sequence after re-seeding the
+// generator with a fixed per-plane line number; a plane without grain is
+// zero-filled instead.
+//
+// Both planes are then filtered over the area inside the pads.  For every
+// sample a weighted sum is formed from the positions listed in
+// pred_pos_chroma (weights in ar_coeffs_cb / ar_coeffs_cr):
+//   - third field 0: a neighbouring sample of the same chroma plane;
+//   - third field 1: the luma grain covering this chroma sample, averaged
+//     over the (1 << subsamp_y) x (1 << subsamp_x) luma samples it spans.
+// Any other value prints an error and terminates the process.
+// The rounded sum is added to the sample, clamped to [grain_min, grain_max],
+// and written back only for planes that carry grain.
+static void generate_chroma_grain_blocks(
+    aom_film_grain_t *params, int **pred_pos_chroma, int *luma_grain_block,
+    int *cb_grain_block, int *cr_grain_block, int luma_grain_stride,
+    int chroma_block_size_y, int chroma_block_size_x, int chroma_grain_stride,
+    int left_pad, int top_pad, int right_pad, int bottom_pad,
+    int chroma_subsamp_y, int chroma_subsamp_x) {
+  int bit_depth = params->bit_depth;
+  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
+
+  int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+  if (params->num_y_points > 0) ++num_pos_chroma;
+  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
+  int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x;
+
+  const int has_cb = params->num_cb_points || params->chroma_scaling_from_luma;
+  const int has_cr = params->num_cr_points || params->chroma_scaling_from_luma;
+
+  if (has_cb) {
+    init_random_generator(7 << 5, params->random_seed);
+
+    for (int i = 0; i < chroma_block_size_y; i++)
+      for (int j = 0; j < chroma_block_size_x; j++)
+        cb_grain_block[i * chroma_grain_stride + j] =
+            (gaussian_sequence[get_random_number(gauss_bits)] +
+             ((1 << gauss_sec_shift) >> 1)) >>
+            gauss_sec_shift;
+  } else {
+    // Each plane must clear its OWN block.  Clearing the other plane here
+    // would leave Cb uninitialised for the filter below, and in the Cr
+    // branch would wipe Cb noise that has just been generated.
+    memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples);
+  }
+
+  if (has_cr) {
+    init_random_generator(11 << 5, params->random_seed);
+
+    for (int i = 0; i < chroma_block_size_y; i++)
+      for (int j = 0; j < chroma_block_size_x; j++)
+        cr_grain_block[i * chroma_grain_stride + j] =
+            (gaussian_sequence[get_random_number(gauss_bits)] +
+             ((1 << gauss_sec_shift) >> 1)) >>
+            gauss_sec_shift;
+  } else {
+    memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples);
+  }
+
+  for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
+    for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) {
+      int wsum_cb = 0;
+      int wsum_cr = 0;
+      for (int pos = 0; pos < num_pos_chroma; pos++) {
+        if (pred_pos_chroma[pos][2] == 0) {
+          // Prediction from a neighbour inside the same chroma plane.
+          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] *
+                                  cb_grain_block[(i + pred_pos_chroma[pos][0]) *
+                                                     chroma_grain_stride +
+                                                 j + pred_pos_chroma[pos][1]];
+          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] *
+                                  cr_grain_block[(i + pred_pos_chroma[pos][0]) *
+                                                     chroma_grain_stride +
+                                                 j + pred_pos_chroma[pos][1]];
+        } else if (pred_pos_chroma[pos][2] == 1) {
+          // Prediction from luma: map the chroma position (pads excluded)
+          // to luma coordinates and average the covered luma grain samples.
+          int av_luma = 0;
+          int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad;
+          int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad;
+
+          for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1;
+               k++)
+            for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1;
+                 l++)
+              av_luma += luma_grain_block[k * luma_grain_stride + l];
+
+          av_luma =
+              (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >>
+              (chroma_subsamp_y + chroma_subsamp_x);
+
+          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
+          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
+        } else {
+          printf(
+              "Grain synthesis: prediction between two chroma components is "
+              "not supported!");
+          exit(1);
+        }
+      }
+      if (has_cb)
+        cb_grain_block[i * chroma_grain_stride + j] =
+            clamp(cb_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+      if (has_cr)
+        cr_grain_block[i * chroma_grain_stride + j] =
+            clamp(cr_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+    }
+}
+
+// Expands a piecewise-linear scaling function into a 256-entry lookup table.
+//
+// scaling_points: `num_points` pairs {x, y}; x is used directly as an index
+//                 into scaling_lut.
+// scaling_lut:    output table.  Left untouched when num_points is 0.
+//
+// Entries below the first point take the first point's y, entries from the
+// last point's x up to 255 take the last point's y, and entries in between
+// are linearly interpolated in 16.16 fixed point.
+static void init_scaling_function(int scaling_points[][2], int num_points,
+                                  int scaling_lut[]) {
+  if (num_points == 0) return;
+
+  for (int i = 0; i < scaling_points[0][0]; i++)
+    scaling_lut[i] = scaling_points[0][1];
+
+  for (int point = 0; point < num_points - 1; point++) {
+    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
+    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
+
+    // A segment whose x does not increase covers no table entries.  Skipping
+    // it also avoids dividing by zero when two points share the same x.
+    if (delta_x <= 0) continue;
+
+    // Slope in 16.16 fixed point; the reciprocal of delta_x is rounded.
+    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+
+    for (int x = 0; x < delta_x; x++) {
+      scaling_lut[scaling_points[point][0] + x] =
+          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
+    }
+  }
+
+  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
+    scaling_lut[i] = scaling_points[num_points - 1][1];
+}
+
+// Reads the scaling LUT at a sample value.  The top 8 bits of `index` select
+// the table entry; for bit depths above 8 the remaining low bits are used to
+// interpolate linearly (with rounding) towards the next entry.  No
+// interpolation is done for 8-bit input or at the last entry (255).
+static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
+  const int extra_bits = bit_depth - 8;
+  const int x = index >> extra_bits;
+
+  if (extra_bits == 0 || x == 255) return scaling_lut[x];
+
+  const int frac = index & ((1 << extra_bits) - 1);
+  const int step = scaling_lut[x + 1] - scaling_lut[x];
+  const int half = 1 << (extra_bits - 1);
+  return scaling_lut[x] + ((step * frac + half) >> extra_bits);
+}
+
+// Adds grain, in place, to one block of an 8-bit picture.
+//
+// The block covers (half_luma_height * 2) x (half_luma_width * 2) luma
+// samples and the matching chroma samples for the given subsampling.  Each
+// output sample is
+//   clamp(sample + ((scale * grain + round) >> scaling_shift), min, max)
+// where `scale` comes from the plane's scaling LUT.  For luma the LUT is
+// indexed by the sample itself; for chroma it is indexed by a combination of
+// the co-located luma value and the chroma sample.
+//
+// Chroma is handled before luma on purpose: the chroma scaling reads luma
+// samples, and it must see them before grain has been added to them.
+static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma,
+                               uint8_t *cb, uint8_t *cr, int luma_stride,
+                               int chroma_stride, int *luma_grain,
+                               int *cb_grain, int *cr_grain,
+                               int luma_grain_stride, int chroma_grain_stride,
+                               int half_luma_height, int half_luma_width,
+                               int bit_depth, int chroma_subsamp_y,
+                               int chroma_subsamp_x, int mc_identity) {
+  const int from_luma = params->chroma_scaling_from_luma ? 1 : 0;
+
+  // Multipliers are in 1/64 units (see the >> 6 below).  With scaling taken
+  // from luma, the LUT index is simply the luma value.
+  int cb_mult = 0, cb_luma_mult = 64, cb_offset = 0;
+  int cr_mult = 0, cr_luma_mult = 64, cr_offset = 0;
+  if (!from_luma) {
+    cb_mult = params->cb_mult - 128;
+    cb_luma_mult = params->cb_luma_mult - 128;
+    cb_offset = params->cb_offset - 256;
+
+    cr_mult = params->cr_mult - 128;
+    cr_luma_mult = params->cr_luma_mult - 128;
+    cr_offset = params->cr_offset - 256;
+  }
+
+  const int round = (1 << (params->scaling_shift - 1));
+
+  const int apply_y = params->num_y_points > 0;
+  const int apply_cb = params->num_cb_points > 0 || from_luma;
+  const int apply_cr = params->num_cr_points > 0 || from_luma;
+
+  // Output range: full range unless restricted-range clipping is requested.
+  // With identity matrix coefficients chroma uses the luma limits.
+  int min_luma = 0, max_luma = 255;
+  int min_chroma = 0, max_chroma = 255;
+  if (params->clip_to_restricted_range) {
+    min_luma = min_luma_legal_range;
+    max_luma = max_luma_legal_range;
+    min_chroma = mc_identity ? min_luma_legal_range : min_chroma_legal_range;
+    max_chroma = mc_identity ? max_luma_legal_range : max_chroma_legal_range;
+  }
+
+  const int chroma_rows = half_luma_height << (1 - chroma_subsamp_y);
+  const int chroma_cols = half_luma_width << (1 - chroma_subsamp_x);
+
+  for (int row = 0; row < chroma_rows; ++row) {
+    for (int col = 0; col < chroma_cols; ++col) {
+      // Co-located luma: with horizontal subsampling, the rounded mean of
+      // the two luma samples on the row; otherwise the single sample.
+      const int l_idx = (row << chroma_subsamp_y) * luma_stride +
+                        (col << chroma_subsamp_x);
+      const int average_luma =
+          chroma_subsamp_x ? (luma[l_idx] + luma[l_idx + 1] + 1) >> 1
+                           : luma[l_idx];
+
+      const int c_idx = row * chroma_stride + col;
+      const int g_idx = row * chroma_grain_stride + col;
+
+      if (apply_cb) {
+        const int sample = cb[c_idx];
+        const int combined =
+            ((average_luma * cb_luma_mult + cb_mult * sample) >> 6) + cb_offset;
+        const int lut_index =
+            clamp(combined, 0, (256 << (bit_depth - 8)) - 1);
+        const int scale = scale_LUT(scaling_lut_cb, lut_index, 8);
+        const int noise =
+            (scale * cb_grain[g_idx] + round) >> params->scaling_shift;
+        cb[c_idx] = clamp(sample + noise, min_chroma, max_chroma);
+      }
+
+      if (apply_cr) {
+        const int sample = cr[c_idx];
+        const int combined =
+            ((average_luma * cr_luma_mult + cr_mult * sample) >> 6) + cr_offset;
+        const int lut_index =
+            clamp(combined, 0, (256 << (bit_depth - 8)) - 1);
+        const int scale = scale_LUT(scaling_lut_cr, lut_index, 8);
+        const int noise =
+            (scale * cr_grain[g_idx] + round) >> params->scaling_shift;
+        cr[c_idx] = clamp(sample + noise, min_chroma, max_chroma);
+      }
+    }
+  }
+
+  if (!apply_y) return;
+
+  const int luma_rows = half_luma_height << 1;
+  const int luma_cols = half_luma_width << 1;
+  for (int row = 0; row < luma_rows; ++row) {
+    for (int col = 0; col < luma_cols; ++col) {
+      const int idx = row * luma_stride + col;
+      const int sample = luma[idx];
+      const int scale = scale_LUT(scaling_lut_y, sample, 8);
+      const int noise =
+          (scale * luma_grain[row * luma_grain_stride + col] + round) >>
+          params->scaling_shift;
+      luma[idx] = clamp(sample + noise, min_luma, max_luma);
+    }
+  }
+}
+
+// High-bit-depth (16-bit sample storage) counterpart of the 8-bit grain
+// blending routine.  It adds grain, in place, to one block covering
+// (half_luma_height * 2) x (half_luma_width * 2) luma samples and the
+// matching chroma samples.
+//
+// Differences from the 8-bit path that are visible here: the chroma offsets
+// and the clipping limits are scaled by (bit_depth - 8) bits, and the
+// scaling LUT is read with `bit_depth` so that it interpolates between
+// entries.
+//
+// Chroma is handled before luma on purpose: the chroma scaling reads luma
+// samples, and it must see them before grain has been added to them.
+static void add_noise_to_block_hbd(
+    aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
+    int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
+    int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
+    int half_luma_height, int half_luma_width, int bit_depth,
+    int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
+  const int depth_shift = bit_depth - 8;
+  const int from_luma = params->chroma_scaling_from_luma ? 1 : 0;
+
+  // Multipliers are in 1/64 units (see the >> 6 below); the offsets depend
+  // on the bit depth.
+  int cb_mult = params->cb_mult - 128;
+  int cb_luma_mult = params->cb_luma_mult - 128;
+  int cb_offset = (params->cb_offset << depth_shift) - (1 << bit_depth);
+
+  int cr_mult = params->cr_mult - 128;
+  int cr_luma_mult = params->cr_luma_mult - 128;
+  int cr_offset = (params->cr_offset << depth_shift) - (1 << bit_depth);
+
+  // With scaling taken from luma, the LUT index is simply the luma value.
+  if (from_luma) {
+    cb_mult = cr_mult = 0;
+    cb_luma_mult = cr_luma_mult = 64;
+    cb_offset = cr_offset = 0;
+  }
+
+  const int round = (1 << (params->scaling_shift - 1));
+
+  const int apply_y = params->num_y_points > 0;
+  const int apply_cb = params->num_cb_points > 0 || from_luma;
+  const int apply_cr = params->num_cr_points > 0 || from_luma;
+
+  // Output range: full range unless restricted-range clipping is requested.
+  // With identity matrix coefficients chroma uses the luma limits.
+  int min_luma, max_luma, min_chroma, max_chroma;
+  if (!params->clip_to_restricted_range) {
+    min_luma = min_chroma = 0;
+    max_luma = max_chroma = (256 << depth_shift) - 1;
+  } else {
+    min_luma = min_luma_legal_range << depth_shift;
+    max_luma = max_luma_legal_range << depth_shift;
+    if (mc_identity) {
+      min_chroma = min_luma_legal_range << depth_shift;
+      max_chroma = max_luma_legal_range << depth_shift;
+    } else {
+      min_chroma = min_chroma_legal_range << depth_shift;
+      max_chroma = max_chroma_legal_range << depth_shift;
+    }
+  }
+
+  const int chroma_rows = half_luma_height << (1 - chroma_subsamp_y);
+  const int chroma_cols = half_luma_width << (1 - chroma_subsamp_x);
+
+  for (int row = 0; row < chroma_rows; ++row) {
+    for (int col = 0; col < chroma_cols; ++col) {
+      // Co-located luma: with horizontal subsampling, the rounded mean of
+      // the two luma samples on the row; otherwise the single sample.
+      const int l_idx = (row << chroma_subsamp_y) * luma_stride +
+                        (col << chroma_subsamp_x);
+      const int average_luma =
+          chroma_subsamp_x ? (luma[l_idx] + luma[l_idx + 1] + 1) >> 1
+                           : luma[l_idx];
+
+      const int c_idx = row * chroma_stride + col;
+      const int g_idx = row * chroma_grain_stride + col;
+
+      if (apply_cb) {
+        const int sample = cb[c_idx];
+        const int combined =
+            ((average_luma * cb_luma_mult + cb_mult * sample) >> 6) + cb_offset;
+        const int lut_index = clamp(combined, 0, (256 << depth_shift) - 1);
+        const int scale = scale_LUT(scaling_lut_cb, lut_index, bit_depth);
+        const int noise =
+            (scale * cb_grain[g_idx] + round) >> params->scaling_shift;
+        cb[c_idx] = clamp(sample + noise, min_chroma, max_chroma);
+      }
+
+      if (apply_cr) {
+        const int sample = cr[c_idx];
+        const int combined =
+            ((average_luma * cr_luma_mult + cr_mult * sample) >> 6) + cr_offset;
+        const int lut_index = clamp(combined, 0, (256 << depth_shift) - 1);
+        const int scale = scale_LUT(scaling_lut_cr, lut_index, bit_depth);
+        const int noise =
+            (scale * cr_grain[g_idx] + round) >> params->scaling_shift;
+        cr[c_idx] = clamp(sample + noise, min_chroma, max_chroma);
+      }
+    }
+  }
+
+  if (!apply_y) return;
+
+  const int luma_rows = half_luma_height << 1;
+  const int luma_cols = half_luma_width << 1;
+  for (int row = 0; row < luma_rows; ++row) {
+    for (int col = 0; col < luma_cols; ++col) {
+      const int idx = row * luma_stride + col;
+      const int sample = luma[idx];
+      const int scale = scale_LUT(scaling_lut_y, sample, bit_depth);
+      const int noise =
+          (scale * luma_grain[row * luma_grain_stride + col] + round) >>
+          params->scaling_shift;
+      luma[idx] = clamp(sample + noise, min_luma, max_luma);
+    }
+  }
+}
+
+// Copies a width x height rectangle of samples from `src` to `dst`, one row
+// at a time.  Both pointers advance by their stride per row, so the strides
+// are in the units of the uint8_t pointers; each row copies `width` bytes,
+// or 2 * width bytes when use_high_bit_depth is set.
+static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst,
+                      int dst_stride, int width, int height,
+                      int use_high_bit_depth) {
+  const int bytes_per_sample = use_high_bit_depth ? 2 : 1;
+  for (int rows_left = height; rows_left != 0; --rows_left) {
+    memcpy(dst, src, width * sizeof(uint8_t) * bytes_per_sample);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Copies a width x height rectangle of ints from `src` to `dst`, row by row.
+// Strides are counted in ints.
+static void copy_area(int *src, int src_stride, int *dst, int dst_stride,
+                      int width, int height) {
+  for (int rows_left = height; rows_left != 0; --rows_left) {
+    memcpy(dst, src, width * sizeof(*src));
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Pads a width x height picture in `dst` out to even dimensions by
+// replication: an odd width gets one extra column copied from the last
+// column, and an odd height gets one extra row copied from the last row
+// (including the extra column, if any).  Nothing happens when both
+// dimensions are already even.
+//
+// dst_stride is in bytes; in high-bit-depth mode `dst` is accessed as
+// uint16_t samples and the stride is halved accordingly.  The buffer must
+// have room for the added column/row.
+static void extend_even(uint8_t *dst, int dst_stride, int width, int height,
+                        int use_high_bit_depth) {
+  const int odd_width = width & 1;
+  const int odd_height = height & 1;
+  if (!odd_width && !odd_height) return;
+
+  const int even_width = (width + 1) & (~1);
+
+  if (use_high_bit_depth) {
+    uint16_t *dst16 = (uint16_t *)dst;
+    const int stride16 = dst_stride / 2;
+    if (odd_width) {
+      for (int r = 0; r < height; ++r) {
+        uint16_t *line = &dst16[r * stride16];
+        line[width] = line[width - 1];
+      }
+    }
+    if (odd_height) {
+      memcpy(&dst16[height * stride16], &dst16[(height - 1) * stride16],
+             sizeof(*dst16) * even_width);
+    }
+    return;
+  }
+
+  if (odd_width) {
+    for (int r = 0; r < height; ++r) {
+      uint8_t *line = &dst[r * dst_stride];
+      line[width] = line[width - 1];
+    }
+  }
+  if (odd_height) {
+    memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
+           sizeof(*dst) * even_width);
+  }
+}
+
+// Blends a 1- or 2-sample-wide column strip where two grain blocks meet
+// side by side.  For every row the left and right samples are mixed with
+// fixed weights (in 1/32 units, rounded) and clamped to
+// [grain_min, grain_max]:
+//   width 1: 23 * left + 22 * right
+//   width 2: 27/17 for the first column, 17/27 for the second
+// Any other width leaves dst_block untouched.
+static void ver_boundary_overlap(int *left_block, int left_stride,
+                                 int *right_block, int right_stride,
+                                 int *dst_block, int dst_stride, int width,
+                                 int height) {
+  switch (width) {
+    case 1:
+      for (int rows_left = height; rows_left != 0; --rows_left) {
+        *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5,
+                           grain_min, grain_max);
+        left_block += left_stride;
+        right_block += right_stride;
+        dst_block += dst_stride;
+      }
+      break;
+    case 2:
+      for (int rows_left = height; rows_left != 0; --rows_left) {
+        // Column nearer the left block leans on the left sample ...
+        dst_block[0] =
+            clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5,
+                  grain_min, grain_max);
+        // ... and the column nearer the right block on the right one.
+        dst_block[1] =
+            clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5,
+                  grain_min, grain_max);
+        left_block += left_stride;
+        right_block += right_stride;
+        dst_block += dst_stride;
+      }
+      break;
+    default: break;
+  }
+}
+
+// Blends a 1- or 2-sample-high row strip where two grain blocks meet one
+// above the other.  For every column the top and bottom samples are mixed
+// with fixed weights (in 1/32 units, rounded) and clamped to
+// [grain_min, grain_max]:
+//   height 1: 23 * top + 22 * bottom
+//   height 2: 27/17 for the first row, 17/27 for the second
+// Any other height leaves dst_block untouched.
+static void hor_boundary_overlap(int *top_block, int top_stride,
+                                 int *bottom_block, int bottom_stride,
+                                 int *dst_block, int dst_stride, int width,
+                                 int height) {
+  switch (height) {
+    case 1:
+      for (int cols_left = width; cols_left != 0; --cols_left) {
+        *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5,
+                           grain_min, grain_max);
+        ++top_block;
+        ++bottom_block;
+        ++dst_block;
+      }
+      break;
+    case 2:
+      for (int cols_left = width; cols_left != 0; --cols_left) {
+        // Row nearer the top block leans on the top sample ...
+        dst_block[0] =
+            clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5,
+                  grain_min, grain_max);
+        // ... and the row nearer the bottom block on the bottom one.
+        dst_block[dst_stride] =
+            clamp((17 * top_block[top_stride] +
+                   27 * bottom_block[bottom_stride] + 16) >>
+                      5,
+                  grain_min, grain_max);
+        ++top_block;
+        ++bottom_block;
+        ++dst_block;
+      }
+      break;
+    default: break;
+  }
+}
+
+// Image-level entry point: copies `src` into `dst`, then adds film grain to
+// `dst` in place through av1_add_film_grain_run().
+//
+// Steps, in order:
+//   1. Derive chroma subsampling and sample size from src->fmt.  An
+//      unsupported format prints an error and terminates the process.
+//   2. Copy the display/colour metadata fields from src to dst.
+//   3. Copy the luma plane and pad it to even dimensions inside dst (dst is
+//      assumed to have room for that); copy the chroma planes unless the
+//      image is monochrome.
+//   4. Store dst->bit_depth in params->bit_depth and run grain synthesis on
+//      the dst planes with the even-rounded width and height.
+void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src,
+                        aom_image_t *dst) {
+  int use_high_bit_depth = 0;
+  int chroma_subsamp_x = 0;
+  int chroma_subsamp_y = 0;
+  int mc_identity = 0;
+  if (src->mc == AOM_CICP_MC_IDENTITY) mc_identity = 1;
+
+  // Formats are grouped by chroma layout; the *16 variants store samples in
+  // 16 bits.
+  switch (src->fmt) {
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_I42016:
+      use_high_bit_depth = (src->fmt == AOM_IMG_FMT_I42016) ? 1 : 0;
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 1;
+      break;
+    case AOM_IMG_FMT_I444:
+    case AOM_IMG_FMT_I44416:
+      use_high_bit_depth = (src->fmt == AOM_IMG_FMT_I44416) ? 1 : 0;
+      chroma_subsamp_x = 0;
+      chroma_subsamp_y = 0;
+      break;
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42216:
+      use_high_bit_depth = (src->fmt == AOM_IMG_FMT_I42216) ? 1 : 0;
+      chroma_subsamp_x = 1;
+      chroma_subsamp_y = 0;
+      break;
+    default:  // unknown input format
+      printf("Film grain error: input format is not supported!");
+      exit(1);
+  }
+
+  // Size and colour description travel with the picture.
+  dst->r_w = src->r_w;
+  dst->r_h = src->r_h;
+  dst->d_w = src->d_w;
+  dst->d_h = src->d_h;
+
+  dst->cp = src->cp;
+  dst->tc = src->tc;
+  dst->mc = src->mc;
+
+  dst->monochrome = src->monochrome;
+  dst->csp = src->csp;
+  dst->range = src->range;
+
+  dst->x_chroma_shift = src->x_chroma_shift;
+  dst->y_chroma_shift = src->y_chroma_shift;
+
+  dst->temporal_id = src->temporal_id;
+  dst->spatial_id = src->spatial_id;
+
+  // Grain is applied on even dimensions: round odd sizes up by one.
+  int width = src->d_w;
+  if (src->d_w % 2) width = src->d_w + 1;
+  int height = src->d_h;
+  if (src->d_h % 2) height = src->d_h + 1;
+
+  copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+            dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
+            src->d_h, use_high_bit_depth);
+  // Note that dst is already assumed to be aligned to even.
+  extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
+              src->d_h, use_high_bit_depth);
+
+  if (!src->monochrome) {
+    const int chroma_planes[2] = { AOM_PLANE_U, AOM_PLANE_V };
+    for (int p = 0; p < 2; ++p) {
+      const int plane = chroma_planes[p];
+      copy_rect(src->planes[plane], src->stride[plane], dst->planes[plane],
+                dst->stride[plane], width >> chroma_subsamp_x,
+                height >> chroma_subsamp_y, use_high_bit_depth);
+    }
+  }
+
+  uint8_t *luma = dst->planes[AOM_PLANE_Y];
+  uint8_t *cb = dst->planes[AOM_PLANE_U];
+  uint8_t *cr = dst->planes[AOM_PLANE_V];
+
+  // luma and chroma strides in samples
+  const int luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
+  const int chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
+
+  params->bit_depth = dst->bit_depth;
+
+  av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride,
+                         chroma_stride, use_high_bit_depth, chroma_subsamp_y,
+                         chroma_subsamp_x, mc_identity);
+}
+
+void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma,
+                            uint8_t *cb, uint8_t *cr, int height, int width,
+                            int luma_stride, int chroma_stride,
+                            int use_high_bit_depth, int chroma_subsamp_y,
+                            int chroma_subsamp_x, int mc_identity) {
+  int **pred_pos_luma;
+  int **pred_pos_chroma;
+  int *luma_grain_block;
+  int *cb_grain_block;
+  int *cr_grain_block;
+
+  int *y_line_buf;
+  int *cb_line_buf;
+  int *cr_line_buf;
+
+  int *y_col_buf;
+  int *cb_col_buf;
+  int *cr_col_buf;
+
+  random_register = params->random_seed;
+
+  int left_pad = 3;
+  int right_pad = 3;  // padding to offset for AR coefficients
+  int top_pad = 3;
+  int bottom_pad = 0;
+
+  int ar_padding = 3;  // maximum lag used for stabilization of AR coefficients
+
+  luma_subblock_size_y = 32;
+  luma_subblock_size_x = 32;
+
+  chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
+  chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
+
+  // Initial padding is only needed for generation of
+  // film grain templates (to stabilize the AR process)
+  // Only a 64x64 luma and 32x32 chroma part of a template
+  // is used later for adding grain, padding can be discarded
+
+  int luma_block_size_y =
+      top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
+  int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
+                          2 * ar_padding + right_pad;
+
+  int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+                            chroma_subblock_size_y * 2 + bottom_pad;
+  int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+                            chroma_subblock_size_x * 2 +
+                            (2 >> chroma_subsamp_x) * ar_padding + right_pad;
+
+  int luma_grain_stride = luma_block_size_x;
+  int chroma_grain_stride = chroma_block_size_x;
+
+  int overlap = params->overlap_flag;
+  int bit_depth = params->bit_depth;
+
+  grain_center = 128 << (bit_depth - 8);
+  grain_min = 0 - grain_center;
+  grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
+
+  init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
+              &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
+              &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
+              &y_col_buf, &cb_col_buf, &cr_col_buf,
+              luma_block_size_y * luma_block_size_x,
+              chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
+              chroma_subsamp_x);
+
+  generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
+                            luma_block_size_y, luma_block_size_x,
+                            luma_grain_stride, left_pad, top_pad, right_pad,
+                            bottom_pad);
+
+  generate_chroma_grain_blocks(
+      params,
+      //                               pred_pos_luma,
+      pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
+      luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
+      chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
+      chroma_subsamp_y, chroma_subsamp_x);
+
+  init_scaling_function(params->scaling_points_y, params->num_y_points,
+                        scaling_lut_y);
+
+  if (params->chroma_scaling_from_luma) {
+    memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+    memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+  } else {
+    init_scaling_function(params->scaling_points_cb, params->num_cb_points,
+                          scaling_lut_cb);
+    init_scaling_function(params->scaling_points_cr, params->num_cr_points,
+                          scaling_lut_cr);
+  }
+  for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
+    init_random_generator(y * 2, params->random_seed);
+
+    for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
+      int offset_y = get_random_number(8);
+      int offset_x = (offset_y >> 4) & 15;
+      offset_y &= 15;
+
+      int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1);
+      int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1);
+
+      int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+                            offset_y * (2 >> chroma_subsamp_y);
+      int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+                            offset_x * (2 >> chroma_subsamp_x);
+
+      if (overlap && x) {
+        ver_boundary_overlap(
+            y_col_buf, 2,
+            luma_grain_block + luma_offset_y * luma_grain_stride +
+                luma_offset_x,
+            luma_grain_stride, y_col_buf, 2, 2,
+            AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+        ver_boundary_overlap(
+            cb_col_buf, 2 >> chroma_subsamp_x,
+            cb_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x,
+            chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+            2 >> chroma_subsamp_x,
+            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                   (height - (y << 1)) >> chroma_subsamp_y));
+
+        ver_boundary_overlap(
+            cr_col_buf, 2 >> chroma_subsamp_x,
+            cr_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x,
+            chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+            2 >> chroma_subsamp_x,
+            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                   (height - (y << 1)) >> chroma_subsamp_y));
+
+        int i = y ? 1 : 0;
+
+        if (use_high_bit_depth) {
+          add_noise_to_block_hbd(
+              params,
+              (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
+              (uint16_t *)cb +
+                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              (uint16_t *)cr +
+                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              luma_stride, chroma_stride, y_col_buf + i * 4,
+              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              2, (2 - chroma_subsamp_x),
+              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        } else {
+          add_noise_to_block(
+              params, luma + ((y + i) << 1) * luma_stride + (x << 1),
+              cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << (1 - chroma_subsamp_x)),
+              luma_stride, chroma_stride, y_col_buf + i * 4,
+              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+              2, (2 - chroma_subsamp_x),
+              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        }
+      }
+
+      if (overlap && y) {
+        if (x) {
+          hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
+                               y_line_buf + (x << 1), luma_stride, 2, 2);
+
+          hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+                               cb_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, 2 >> chroma_subsamp_x,
+                               2 >> chroma_subsamp_y);
+
+          hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+                               cr_line_buf + x * (2 >> chroma_subsamp_x),
+                               chroma_stride, 2 >> chroma_subsamp_x,
+                               2 >> chroma_subsamp_y);
+        }
+
+        hor_boundary_overlap(
+            y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+            luma_grain_block + luma_offset_y * luma_grain_stride +
+                luma_offset_x + (x ? 2 : 0),
+            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+            AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
+                   width - ((x ? x + 1 : 0) << 1)),
+            2);
+
+        hor_boundary_overlap(
+            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            cb_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_grain_stride,
+            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            AOMMIN(chroma_subblock_size_x -
+                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+            2 >> chroma_subsamp_y);
+
+        hor_boundary_overlap(
+            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            cr_grain_block + chroma_offset_y * chroma_grain_stride +
+                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_grain_stride,
+            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            AOMMIN(chroma_subblock_size_x -
+                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+            2 >> chroma_subsamp_y);
+
+        if (use_high_bit_depth) {
+          add_noise_to_block_hbd(
+              params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
+              (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        } else {
+          add_noise_to_block(
+              params, luma + (y << 1) * luma_stride + (x << 1),
+              cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        }
+      }
+
+      int i = overlap && y ? 1 : 0;
+      int j = overlap && x ? 1 : 0;
+
+      if (use_high_bit_depth) {
+        add_noise_to_block_hbd(
+            params,
+            (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            (uint16_t *)cb +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            (uint16_t *)cr +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      } else {
+        add_noise_to_block(
+            params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      }
+
+      if (overlap) {
+        if (x) {
+          // Copy overlapped column bufer to line buffer
+          copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
+                    y_line_buf + (x << 1), luma_stride, 2, 2);
+
+          copy_area(
+              cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+              2 >> chroma_subsamp_x,
+              cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+
+          copy_area(
+              cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+              2 >> chroma_subsamp_x,
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+        }
+
+        // Copy grain to the line buffer for overlap with a bottom block
+        copy_area(
+            luma_grain_block +
+                (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
+                luma_offset_x + ((x ? 2 : 0)),
+            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+            AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
+
+        copy_area(cb_grain_block +
+                      (chroma_offset_y + chroma_subblock_size_y) *
+                          chroma_grain_stride +
+                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+                  chroma_grain_stride,
+                  cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+                  chroma_stride,
+                  AOMMIN(chroma_subblock_size_x,
+                         ((width - (x << 1)) >> chroma_subsamp_x)) -
+                      (x ? 2 >> chroma_subsamp_x : 0),
+                  2 >> chroma_subsamp_y);
+
+        copy_area(cr_grain_block +
+                      (chroma_offset_y + chroma_subblock_size_y) *
+                          chroma_grain_stride +
+                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+                  chroma_grain_stride,
+                  cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+                  chroma_stride,
+                  AOMMIN(chroma_subblock_size_x,
+                         ((width - (x << 1)) >> chroma_subsamp_x)) -
+                      (x ? 2 >> chroma_subsamp_x : 0),
+                  2 >> chroma_subsamp_y);
+
+        // Copy grain to the column buffer for overlap with the next block to
+        // the right
+
+        copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
+                      luma_offset_x + luma_subblock_size_x,
+                  luma_grain_stride, y_col_buf, 2, 2,
+                  AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+        copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
+                      chroma_offset_x + chroma_subblock_size_x,
+                  chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+                  2 >> chroma_subsamp_x,
+                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                         (height - (y << 1)) >> chroma_subsamp_y));
+
+        copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
+                      chroma_offset_x + chroma_subblock_size_x,
+                  chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+                  2 >> chroma_subsamp_x,
+                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+                         (height - (y << 1)) >> chroma_subsamp_y));
+      }
+    }
+  }
+
+  dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
+                 &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
+                 &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
+}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
new file mode 100644
index 000000000..016cb12d7
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_synthesis.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+#ifndef AOM_AOM_GRAIN_SYNTHESIS_H_
+#define AOM_AOM_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom/aom_image.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis
+ */
+typedef struct {
+  int apply_grain;  // presumably nonzero = add grain (TODO confirm)
+
+  int update_parameters;  // nonzero: grain table entries carry a parameter set
+
+  // 8 bit values; each row is one <point_x> <point_y> pair
+  int scaling_points_y[14][2];
+  int num_y_points;  // value: 0..14
+
+  // 8 bit values; each row is one <point_x> <point_y> pair
+  int scaling_points_cb[10][2];
+  int num_cb_points;  // value: 0..10
+
+  // 8 bit values; each row is one <point_x> <point_y> pair
+  int scaling_points_cr[10][2];
+  int num_cr_points;  // value: 0..10
+
+  int scaling_shift;  // values : 8..11
+
+  int ar_coeff_lag;  // values: 0..3; 2*lag*(lag+1) Y coeffs, one more for Cb/Cr
+
+  // 8 bit values; sized for the largest lag (24 for Y, 25 for each chroma plane)
+  int ar_coeffs_y[24];
+  int ar_coeffs_cb[25];
+  int ar_coeffs_cr[25];
+
+  // Shift value: AR coeffs range
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  int ar_coeff_shift;  // values : 6..9
+
+  int cb_mult;       // 8 bits
+  int cb_luma_mult;  // 8 bits
+  int cb_offset;     // 9 bits
+
+  int cr_mult;       // 8 bits
+  int cr_luma_mult;  // 8 bits
+  int cr_offset;     // 9 bits
+
+  int overlap_flag;  // NOTE(review): presumably toggles block overlap - confirm
+
+  int clip_to_restricted_range;
+
+  int bit_depth;  // video bit depth
+
+  int chroma_scaling_from_luma;
+
+  int grain_scale_shift;
+
+  uint16_t random_seed;  // seed passed to the synthesis random generator
+} aom_film_grain_t;
+
+/*!\brief Add film grain to an image given as separate planes
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    luma             luma plane
+ * \param[in]    cb               cb plane
+ * \param[in]    cr               cr plane
+ * \param[in]    height,width     luma plane height and width
+ * \param[in]    luma_stride      luma plane stride
+ * \param[in]    chroma_stride    chroma plane stride
+ * \param[in]    use_high_bit_depth  nonzero: planes are accessed as uint16_t
+ * \param[in]    chroma_subsamp_y,chroma_subsamp_x  chroma subsampling shifts
+ * \param[in]    mc_identity      passed through to the per-block noise routines
+ */
+void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma,
+                            uint8_t *cb, uint8_t *cr, int height, int width,
+                            int luma_stride, int chroma_stride,
+                            int use_high_bit_depth, int chroma_subsamp_y,
+                            int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain to an image
+ *
+ * Image-level entry point taking aom_image_t buffers instead of raw planes.
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    src              Source image
+ * \param[out]   dst              Resulting image with grain
+ */
+void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src,
+                        aom_image_t *dst);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
new file mode 100644
index 000000000..0d6a73f55
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ascii representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters to make for easy
+ * localization with a parameter set. Each entry is prefixed with "E"
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ *  p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ *  sY <num_y_points> <point_0_x> <point_0_y> ...
+ *  sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ *  sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ *  cY <ar_coeff_y_0> ....
+ *  cCb <ar_coeff_cb_0> ....
+ *  cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <string.h>
+#include <stdio.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+static const char kFileMagic[8] = "filmgrn1";  // 8 chars, no NUL terminator
+
+static void grain_table_entry_read(FILE *file,
+                                   struct aom_internal_error_info *error_info,
+                                   aom_film_grain_table_entry_t *entry) {
+  aom_film_grain_t *pars = &entry->params;  // filled from one "E" record
+  int num_read =
+      fscanf(file, "E %" PRId64 " %" PRId64 " %d %hu %d\n", &entry->start_time,
+             &entry->end_time, &pars->apply_grain, &pars->random_seed,
+             &pars->update_parameters);
+  if (num_read == 0 && feof(file)) return;  // no record left: not an error
+  if (num_read != 5) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read entry header. Read %d != 5", num_read);
+    return;
+  }
+  if (pars->update_parameters) {
+    num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+                      &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+                      &pars->grain_scale_shift, &pars->scaling_shift,
+                      &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+                      &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+                      &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+    if (num_read != 12 || pars->ar_coeff_lag < 0 || pars->ar_coeff_lag > 3) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Invalid or unreadable entry params. Read %d of 12",
+                         num_read);
+      return;
+    }
+    if (1 != fscanf(file, "\tsY %d ", &pars->num_y_points) ||
+        pars->num_y_points < 0 || pars->num_y_points > 14) {  // array capacity
+      aom_internal_error(error_info, AOM_CODEC_ERROR, "Invalid num y points");
+      return;
+    }
+    for (int i = 0; i < pars->num_y_points; ++i) {
+      if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
+                      &pars->scaling_points_y[i][1])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read y scaling points");
+        return;
+      }
+    }
+    if (1 != fscanf(file, "\n\tsCb %d", &pars->num_cb_points) ||
+        pars->num_cb_points < 0 || pars->num_cb_points > 10) {  // array capacity
+      aom_internal_error(error_info, AOM_CODEC_ERROR, "Invalid num cb points");
+      return;
+    }
+    for (int i = 0; i < pars->num_cb_points; ++i) {
+      if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
+                      &pars->scaling_points_cb[i][1])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read cb scaling points");
+        return;
+      }
+    }
+    if (1 != fscanf(file, "\n\tsCr %d", &pars->num_cr_points) ||
+        pars->num_cr_points < 0 || pars->num_cr_points > 10) {  // array capacity
+      aom_internal_error(error_info, AOM_CODEC_ERROR, "Invalid num cr points");
+      return;
+    }
+    for (int i = 0; i < pars->num_cr_points; ++i) {
+      if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
+                      &pars->scaling_points_cr[i][1])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read cr scaling points");
+        return;
+      }
+    }
+    // ar_coeff_lag was range-checked above, so n <= 24 and n + 1 <= 25 fit.
+    fscanf(file, "\n\tcY");
+    const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+    for (int i = 0; i < n; ++i) {
+      if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read Y coeffs");
+        return;
+      }
+    }
+    fscanf(file, "\n\tcCb");
+    for (int i = 0; i <= n; ++i) {  // chroma carries one extra coefficient
+      if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read Cb coeffs");
+        return;
+      }
+    }
+    fscanf(file, "\n\tcCr");
+    for (int i = 0; i <= n; ++i) {
+      if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
+        aom_internal_error(error_info, AOM_CODEC_ERROR,
+                           "Unable to read Cr coeffs");
+        return;
+      }
+    }
+    fscanf(file, "\n");  // consume trailing whitespace so feof() can trigger
+  }
+}
+
+void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) {
+  // Writes the "E" header line, then the parameter lines if update_parameters.
+  const aom_film_grain_t *const g = &entry->params;
+  fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
+          entry->end_time, g->apply_grain, g->random_seed,
+          g->update_parameters);
+  if (!g->update_parameters) return;
+  fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", g->ar_coeff_lag,
+          g->ar_coeff_shift, g->grain_scale_shift, g->scaling_shift,
+          g->chroma_scaling_from_luma, g->overlap_flag, g->cb_mult,
+          g->cb_luma_mult, g->cb_offset, g->cr_mult, g->cr_luma_mult,
+          g->cr_offset);
+  fprintf(file, "\tsY %d ", g->num_y_points);
+  for (int k = 0; k < g->num_y_points; ++k) {
+    const int *const pt = g->scaling_points_y[k];
+    fprintf(file, " %d %d", pt[0], pt[1]);
+  }
+  fprintf(file, "\n\tsCb %d", g->num_cb_points);
+  for (int k = 0; k < g->num_cb_points; ++k) {
+    const int *const pt = g->scaling_points_cb[k];
+    fprintf(file, " %d %d", pt[0], pt[1]);
+  }
+  fprintf(file, "\n\tsCr %d", g->num_cr_points);
+  for (int k = 0; k < g->num_cr_points; ++k) {
+    const int *const pt = g->scaling_points_cr[k];
+    fprintf(file, " %d %d", pt[0], pt[1]);
+  }
+  // AR coefficients: 2 * lag * (lag + 1) for luma, one more per chroma plane.
+  const int luma_coeffs = 2 * g->ar_coeff_lag * (g->ar_coeff_lag + 1);
+  fprintf(file, "\n\tcY");
+  for (int k = 0; k < luma_coeffs; ++k) {
+    fprintf(file, " %d", g->ar_coeffs_y[k]);
+  }
+  fprintf(file, "\n\tcCb");
+  for (int k = 0; k <= luma_coeffs; ++k) {
+    fprintf(file, " %d", g->ar_coeffs_cb[k]);
+  }
+  fprintf(file, "\n\tcCr");
+  for (int k = 0; k <= luma_coeffs; ++k) {
+    fprintf(file, " %d", g->ar_coeffs_cr[k]);
+  }
+  fprintf(file, "\n");
+}
+
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+                                 int64_t end_time,
+                                 const aom_film_grain_t *grain) {
+  if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
+    aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+    if (!new_tail) return;  // out of memory: leave the table untouched
+    memset(new_tail, 0, sizeof(*new_tail));
+    if (t->tail) t->tail->next = new_tail;
+    if (!t->head) t->head = new_tail;
+    t->tail = new_tail;  // differing (or first) params start a new entry
+    new_tail->start_time = time_stamp;
+    new_tail->end_time = end_time;
+    new_tail->params = *grain;
+  } else {  // same params as the tail entry: just widen its time range
+    t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
+    t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
+  }
+}
+
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain) {
+  aom_film_grain_table_entry_t *entry = t->head;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
+  uint16_t random_seed = grain ? grain->random_seed : 0;
+  if (grain) memset(grain, 0, sizeof(*grain));
+  // Find the entry whose [start_time, end_time) range contains time_stamp.
+  while (entry) {
+    aom_film_grain_table_entry_t *next = entry->next;
+    if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+      if (grain) {
+        *grain = entry->params;
+        if (time_stamp != 0) grain->random_seed = random_seed;  // keep old seed
+      }
+      if (!erase) return 1;
+
+      const int64_t entry_end_time = entry->end_time;
+      if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+        if (t->tail == entry) t->tail = prev_entry;
+        if (prev_entry) {
+          prev_entry->next = entry->next;
+        } else {
+          t->head = entry->next;
+        }
+        aom_free(entry);  // erased range covers the whole entry
+      } else if (time_stamp <= entry->start_time &&
+                 end_time < entry->end_time) {
+        entry->start_time = end_time;  // trim the front of the entry
+      } else if (time_stamp > entry->start_time &&
+                 end_time >= entry->end_time) {
+        entry->end_time = time_stamp;  // trim the back of the entry
+      } else {
+        aom_film_grain_table_entry_t *rest = aom_malloc(sizeof(*rest));
+        if (!rest) return 0;  // out of memory: table is left unchanged
+        rest->next = entry->next;
+        rest->start_time = end_time;
+        rest->end_time = entry->end_time;
+        rest->params = entry->params;
+        entry->next = rest;
+        entry->end_time = time_stamp;
+        if (t->tail == entry) t->tail = rest;
+      }
+      // If segments aren't aligned, delete from the beginning of subsequent
+      // segments. Use the saved end time: entry may have been freed above.
+      if (end_time > entry_end_time) {
+        aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, NULL);
+      }
+      return 1;
+    }
+    prev_entry = entry;
+    entry = next;
+  }
+  return 0;
+}
+
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  FILE *file = fopen(filename, "rb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+                       filename);
+    return error_info->error_code;
+  }
+  error_info->error_code = AOM_CODEC_OK;
+
+  // Read in one extra character as there should be white space after
+  // the header.
+  char magic[9];
+  if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read (or invalid) file magic");
+    fclose(file);
+    return error_info->error_code;
+  }
+
+  aom_film_grain_table_entry_t *prev_entry = NULL;
+  while (!feof(file)) {
+    aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
+    if (!entry) {
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Out of memory");
+      break;
+    }
+    memset(entry, 0, sizeof(*entry));  // also leaves entry->next cleared
+    grain_table_entry_read(file, error_info, entry);
+    if (prev_entry) prev_entry->next = entry;
+    if (!t->head) t->head = entry;
+    t->tail = prev_entry = entry;
+    if (error_info->error_code != AOM_CODEC_OK) break;
+  }
+
+  fclose(file);
+  return error_info->error_code;
+}
+
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  error_info->error_code = AOM_CODEC_OK;
+
+  FILE *file = fopen(filename, "wb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
+                       filename);
+    return error_info->error_code;
+  }
+
+  if (!fwrite(kFileMagic, 8, 1, file)) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to write file magic");
+    fclose(file);
+    return error_info->error_code;
+  }
+  fprintf(file, "\n");
+  for (aom_film_grain_table_entry_t *e = t->head; e; e = e->next)
+    grain_table_entry_write(file, e);
+  const int write_failed = ferror(file);  // must be sampled before fclose()
+  if (fclose(file) != 0 || write_failed) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to write file %s",
+                       filename);
+  }
+  return error_info->error_code;
+}
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t) {
+  // Release each node, saving its successor before the node goes away.
+  aom_film_grain_table_entry_t *node = t->head;
+  for (aom_film_grain_table_entry_t *after; node != NULL; node = after) {
+    after = node->next;
+    aom_free(node);
+  }
+  memset(t, 0, sizeof(*t));  // zeroes head and tail
+}
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
new file mode 100644
index 000000000..5c20413b2
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief A table mapping from time to corresponding film grain parameters.
+ *
+ * In order to apply grain synthesis in the decoder, the film grain parameters
+ * need to be signalled in the encoder. The film grain parameters are time
+ * varying, and for two-pass encoding (and denoiser implementation flexibility)
+ * it is common to denoise the video and do parameter estimation before encoding
+ * the denoised video.
+ *
+ * The film grain table is used to provide this flexibility and is used as a
+ * parameter that is passed to the encoder.
+ *
+ * Further, if regraining is to be done in say a single pass mode, or in two
+ * pass within the encoder (before frames are added to the lookahead buffer),
+ * this data structure can be used to keep track of on-the-fly estimated grain
+ * parameters, that are then extracted from the table before the encoded frame
+ * is written.
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_synthesis.h"
+#include "aom/internal/aom_codec_internal.h"
+
+typedef struct aom_film_grain_table_entry_t {
+  aom_film_grain_t params;  // grain parameters for this time segment
+  int64_t start_time;       // inclusive start of the segment
+  int64_t end_time;         // exclusive end of the segment
+  struct aom_film_grain_table_entry_t *next;  // next entry, or null at the tail
+} aom_film_grain_table_entry_t;
+
+typedef struct {
+  aom_film_grain_table_entry_t *head;  // first entry; null when empty
+  aom_film_grain_table_entry_t *tail;  // last entry; append extends from here
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in,out] table      The grain table
+ * \param[in]     time_stamp The start time stamp
+ * \param[in]     end_time   The end time stamp
+ * \param[in]     grain      The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+                                 int64_t time_stamp, int64_t end_time,
+                                 const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given time
+ * \param[in,out] t          The grain table (entries change only when erasing)
+ * \param[in]     time_stamp The start time stamp
+ * \param[in]     end_time   The end time stamp
+ * \param[in]     erase      Whether the time segment can be deleted
+ * \param[out]    grain      The output grain parameters (may be NULL)
+ * \return 1 if an entry covering time_stamp was found, 0 otherwise
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ * \param[out]  table       The grain table
+ * \param[in]   filename    The file to read from
+ * \param[in]   error_info  Error info for tracking errors
+ * \return The resulting error_info->error_code (AOM_CODEC_OK on success)
+ */
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *table, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table to a file.
+ *
+ * \param[in]   t           The grain table
+ * \param[in]   filename    The file to write to
+ * \param[in]   error_info  Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t);  // frees all entries
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
index 6d2ac37d9..c6aa6b207 100644
--- a/third_party/aom/aom_dsp/intrapred.c
+++ b/third_party/aom/aom_dsp/intrapred.c
@@ -12,152 +12,14 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/intrapred_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"
 
-#define DST(x, y) dst[(x) + (y)*stride]
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
-#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-
-static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                   int bh, const uint8_t *above,
-                                   const uint8_t *left) {
-  int r, c;
-  (void)above;
-
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
-                            left[(c >> 1) + r + 2])
-                     : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-  (void)left;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
-                            above[(r >> 1) + c + 2])
-                     : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-  (void)left;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = AVG3(above[r + c], above[r + c + 1],
-                    above[r + c + 1 + (r + c + 2 < bw + bh)]);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-
-  // first row
-  for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]);
-  dst += stride;
-
-  // second row
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
-  dst += stride;
-
-  // the rest of first col
-  dst[0] = AVG3(above[-1], left[0], left[1]);
-  for (r = 3; r < bh; ++r)
-    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
-
-  // the rest of the block
-  for (r = 2; r < bh; ++r) {
-    for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1];
-    dst += stride;
-  }
-}
-
-static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int i;
-#if CONFIG_TX64X64
-#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
-  // silence a spurious -Warray-bounds warning, possibly related to:
-  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
-  uint8_t border[133];
-#else
-  uint8_t border[64 + 64 - 1];  // outer border from bottom-left to top-right
-#endif
-#else
-#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
-  // silence a spurious -Warray-bounds warning, possibly related to:
-  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
-  uint8_t border[69];
-#else
-  uint8_t border[32 + 32 - 1];  // outer border from bottom-left to top-right
-#endif
-#endif  // CONFIG_TX64X64
-
-  // dst(bh, bh - 2)[0], i.e., border starting at bottom-left
-  for (i = 0; i < bh - 2; ++i) {
-    border[i] = AVG3(left[bh - 3 - i], left[bh - 2 - i], left[bh - 1 - i]);
-  }
-  border[bh - 2] = AVG3(above[-1], left[0], left[1]);
-  border[bh - 1] = AVG3(left[0], above[-1], above[0]);
-  border[bh - 0] = AVG3(above[-1], above[0], above[1]);
-  // dst[0][2, size), i.e., remaining top border ascending
-  for (i = 0; i < bw - 2; ++i) {
-    border[bh + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
-  }
-
-  for (i = 0; i < bh; ++i) {
-    memcpy(dst + i * stride, border + bh - 1 - i, bw);
-  }
-}
-
-static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                  int bh, const uint8_t *above,
-                                  const uint8_t *left) {
-  int r, c;
-  dst[0] = AVG2(above[-1], left[0]);
-  for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
-  dst++;
-
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bh; r++)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
-  dst++;
-
-  for (c = 0; c < bw - 2; c++)
-    dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
-  dst += stride;
-
-  for (r = 1; r < bh; ++r) {
-    for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2];
-    dst += stride;
-  }
-}
-
 static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, const uint8_t *left) {
   int r;
@@ -244,13 +106,12 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
       for (i = 0; i < 4; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
 
-#if CONFIG_SMOOTH_HV
 static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint8_t *above,
                                       const uint8_t *left) {
@@ -274,7 +135,7 @@ static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
@@ -303,12 +164,11 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel(divide_round(this_pred, log2_scale));
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
-#endif  // CONFIG_SMOOTH_HV
 
 static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
                                     int bh, const uint8_t *above,
@@ -373,267 +233,133 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   }
 }
 
-void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)stride;
-  (void)left;
-
-  DST(0, 0) = AVG3(A, B, C);
-  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
-  DST(1, 1) = AVG3(C, D, D);
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier, int shift2) {
+  const int interm = num >> shift1;
+  return interm * multiplier >> shift2;
 }
 
-void aom_d117_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  DST(0, 0) = AVG2(X, A);
-  DST(1, 0) = AVG2(A, B);
-  DST(0, 1) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(X, A, B);
-}
+  // The constants (multiplier and shifts) for a given block size are obtained
+  // as follows:
+  // - Let sum_w_h =  block width + block height.
+  // - Shift 'sum_w_h' right until we reach an odd number. Let the number of
+  // shifts for that block size be called 'shift1' (see the parameter in
+  // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
+  // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
+  // block].
+  // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
+  // using the "Algorithm 1" in:
+  // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+  // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+  // shift will be 16, regardless of the block size.
 
-void aom_d135_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  (void)stride;
-  DST(0, 1) = AVG3(X, I, J);
-  DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
-  DST(1, 0) = AVG3(B, A, X);
-}
+  // Note: For low bitdepth, assembly code may be optimized by using smaller
+  // constants for smaller block sizes, where the range of the 'sum' is
+  // restricted to fewer bits.
 
-void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-
-  DST(0, 0) = AVG2(I, X);
-  DST(0, 1) = AVG2(J, I);
-  DST(1, 0) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(J, I, X);
-}
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
 
-void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  const int E = above[4];
-  const int F = above[5];
-  const int G = above[6];
-  const int H = above[7];
-  (void)stride;
-  (void)left;
-  DST(0, 0) = AVG3(A, B, C);
-  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
-  DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
-  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
-  DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
-  DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
-  DST(3, 3) = AVG3(G, H, H);
-}
+#define DC_SHIFT2 16
 
-void aom_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  DST(0, 0) = DST(1, 2) = AVG2(X, A);
-  DST(1, 0) = DST(2, 2) = AVG2(A, B);
-  DST(2, 0) = DST(3, 2) = AVG2(B, C);
-  DST(3, 0) = AVG2(C, D);
-
-  DST(0, 3) = AVG3(K, J, I);
-  DST(0, 2) = AVG3(J, I, X);
-  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
-  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
-  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
-  DST(3, 1) = AVG3(B, C, D);
-}
+static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
+                                     int bh, const uint8_t *above,
+                                     const uint8_t *left, int shift1,
+                                     int multiplier) {
+  int sum = 0;
 
-void aom_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)stride;
-  DST(0, 3) = AVG3(J, K, L);
-  DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
-  DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
-  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
-  DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
-  DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
-  DST(3, 0) = AVG3(D, C, B);
+  for (int i = 0; i < bw; i++) {
+    sum += above[i];
+  }
+  for (int i = 0; i < bh; i++) {
+    sum += left[i];
+  }
+
+  const int expected_dc = divide_using_multiply_shift(
+      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+  assert(expected_dc < (1 << 8));
+
+  for (int r = 0; r < bh; r++) {
+    memset(dst, expected_dc, bw);
+    dst += stride;
+  }
 }
 
-void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-
-  DST(0, 0) = DST(2, 1) = AVG2(I, X);
-  DST(0, 1) = DST(2, 2) = AVG2(J, I);
-  DST(0, 2) = DST(2, 3) = AVG2(K, J);
-  DST(0, 3) = AVG2(L, K);
-
-  DST(3, 0) = AVG3(A, B, C);
-  DST(2, 0) = AVG3(X, A, B);
-  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
-  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
-  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
-  DST(1, 3) = AVG3(L, K, J);
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
 }
 
-#if CONFIG_HIGHBITDEPTH
-static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                          int bw, int bh, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  int r, c;
-  (void)above;
-  (void)bd;
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
+}
 
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
-                            left[(c >> 1) + r + 2])
-                     : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
-    }
-    dst += stride;
-  }
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
 }
 
-static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)left;
-  (void)bd;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
-                            above[(r >> 1) + c + 2])
-                     : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
-    }
-    dst += stride;
-  }
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
 }
 
-static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)left;
-  (void)bd;
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = AVG3(above[r + c], above[r + c + 1],
-                    above[r + c + 1 + (r + c + 2 < bw + bh)]);
-    }
-    dst += stride;
-  }
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
 }
 
-static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)bd;
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
+}
 
-  // first row
-  for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]);
-  dst += stride;
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
+}
 
-  // second row
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
-  dst += stride;
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
+}
 
-  // the rest of first col
-  dst[0] = AVG3(above[-1], left[0], left[1]);
-  for (r = 3; r < bh; ++r)
-    dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]);
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
+}
 
-  // the rest of the block
-  for (r = 2; r < bh; ++r) {
-    for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1];
-    dst += stride;
-  }
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
 }
 
-static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)bd;
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
+}
 
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bh; ++r)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
+}
 
-  dst += stride;
-  for (r = 1; r < bh; ++r) {
-    for (c = 1; c < bw; c++) dst[c] = dst[-stride + c - 1];
-    dst += stride;
-  }
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
 }
 
-static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bw, int bh, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)bd;
-  dst[0] = AVG2(above[-1], left[0]);
-  for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]);
-  dst++;
-
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bh; r++)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
-  dst++;
-
-  for (c = 0; c < bw - 2; c++)
-    dst[c] = AVG3(above[c - 1], above[c], above[c + 1]);
-  dst += stride;
-
-  for (r = 1; r < bh; ++r) {
-    for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2];
-    dst += stride;
-  }
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
 }
 
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
 static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
                                       int bh, const uint16_t *above,
                                       const uint16_t *left, int bd) {
@@ -658,93 +384,6 @@ static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   }
 }
 
-void aom_highbd_d207_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  (void)above;
-  (void)bd;
-  DST(0, 0) = AVG2(I, J);
-  DST(0, 1) = AVG2(J, K);
-  DST(1, 0) = AVG3(I, J, K);
-  DST(1, 1) = AVG3(J, K, L);
-}
-
-void aom_highbd_d63_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)left;
-  (void)bd;
-  DST(0, 0) = AVG2(A, B);
-  DST(1, 0) = AVG2(B, C);
-  DST(0, 1) = AVG3(A, B, C);
-  DST(1, 1) = AVG3(B, C, D);
-}
-
-void aom_highbd_d45e_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int A = above[0];
-  const int B = above[1];
-  const int C = above[2];
-  const int D = above[3];
-  (void)stride;
-  (void)left;
-  (void)bd;
-  DST(0, 0) = AVG3(A, B, C);
-  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
-  DST(1, 1) = AVG3(C, D, D);
-}
-
-void aom_highbd_d117_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  (void)bd;
-  DST(0, 0) = AVG2(X, A);
-  DST(1, 0) = AVG2(A, B);
-  DST(0, 1) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(X, A, B);
-}
-
-void aom_highbd_d135_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-  const int B = above[1];
-  (void)bd;
-  DST(0, 1) = AVG3(X, I, J);
-  DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
-  DST(1, 0) = AVG3(B, A, X);
-}
-
-void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int X = above[-1];
-  const int A = above[0];
-  (void)bd;
-  DST(0, 0) = AVG2(I, X);
-  DST(0, 1) = AVG2(J, I);
-  DST(1, 0) = AVG3(I, X, A);
-  DST(1, 1) = AVG3(J, I, X);
-}
-
 static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bw, int bh, const uint16_t *above,
                                           const uint16_t *left, int bd) {
@@ -763,6 +402,7 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
+  (void)bd;
   const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
@@ -785,17 +425,17 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
       for (i = 0; i < 4; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
 
-#if CONFIG_SMOOTH_HV
 static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
                                              int bw, int bh,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
+  (void)bd;
   const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
   const uint8_t *const sm_weights = sm_weight_arrays + bh;
   // scale = 2^sm_weight_log2_scale
@@ -816,7 +456,7 @@ static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
@@ -826,6 +466,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
                                              int bw, int bh,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
+  (void)bd;
   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
   const uint8_t *const sm_weights = sm_weight_arrays + bw;
   // scale = 2^sm_weight_log2_scale
@@ -846,12 +487,11 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
       for (i = 0; i < 2; ++i) {
         this_pred += weights[i] * pixels[i];
       }
-      dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd);
+      dst[c] = divide_round(this_pred, log2_scale);
     }
     dst += stride;
   }
 }
-#endif  // CONFIG_SMOOTH_HV
 
 static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
                                            int bw, int bh,
@@ -922,7 +562,148 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
     dst += stride;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
+
+// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
+// assume 2nd shift of 17 bits instead of 16.
+// Note: Strictly speaking, 2nd shift needs to be 17 only when:
+// - bit depth == 12, and
+// - bw + bh is divisible by 5 (as opposed to divisible by 3).
+// All other cases can use half the multipliers with a shift of 16 instead.
+// This special optimization can be used when writing assembly code.
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+// Note: This constant is odd, but a smaller even constant (0x199a) with the
+// appropriate shift should work for neon in 8/10-bit.
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
+
+#define HIGHBD_DC_SHIFT2 17
+
+static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
+                                            int bw, int bh,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd,
+                                            int shift1, uint32_t multiplier) {
+  int sum = 0;
+  (void)bd;
+
+  for (int i = 0; i < bw; i++) {
+    sum += above[i];
+  }
+  for (int i = 0; i < bh; i++) {
+    sum += left[i];
+  }
+
+  const int expected_dc = divide_using_multiply_shift(
+      sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
+  assert(expected_dc < (1 << bd));
+
+  for (int r = 0; r < bh; r++) {
+    aom_memset16(dst, expected_dc, bw);
+    dst += stride;
+  }
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd) {
+  highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd) {
+  highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
+                           HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
+                           HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
 
 // This serves as a wrapper function, so that all the prediction functions
 // can be unified and accessed as a pointer array. Note that the boundary
@@ -934,7 +715,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
     type##_predictor(dst, stride, width, height, above, left); \
   }
 
-#if CONFIG_HIGHBITDEPTH
 #define intra_pred_highbd_sized(type, width, height)                        \
   void aom_highbd_##type##_predictor_##width##x##height##_c(                \
       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
@@ -943,7 +723,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   }
 
 /* clang-format off */
-#if CONFIG_TX64X64
 #define intra_pred_rectangular(type) \
   intra_pred_sized(type, 4, 8) \
   intra_pred_sized(type, 8, 4) \
@@ -953,6 +732,12 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   intra_pred_sized(type, 32, 16) \
   intra_pred_sized(type, 32, 64) \
   intra_pred_sized(type, 64, 32) \
+  intra_pred_sized(type, 4, 16) \
+  intra_pred_sized(type, 16, 4) \
+  intra_pred_sized(type, 8, 32) \
+  intra_pred_sized(type, 32, 8) \
+  intra_pred_sized(type, 16, 64) \
+  intra_pred_sized(type, 64, 16) \
   intra_pred_highbd_sized(type, 4, 8) \
   intra_pred_highbd_sized(type, 8, 4) \
   intra_pred_highbd_sized(type, 8, 16) \
@@ -960,7 +745,13 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   intra_pred_highbd_sized(type, 16, 32) \
   intra_pred_highbd_sized(type, 32, 16) \
   intra_pred_highbd_sized(type, 32, 64) \
-  intra_pred_highbd_sized(type, 64, 32)
+  intra_pred_highbd_sized(type, 64, 32) \
+  intra_pred_highbd_sized(type, 4, 16) \
+  intra_pred_highbd_sized(type, 16, 4) \
+  intra_pred_highbd_sized(type, 8, 32) \
+  intra_pred_highbd_sized(type, 32, 8) \
+  intra_pred_highbd_sized(type, 16, 64) \
+  intra_pred_highbd_sized(type, 64, 16)
 #define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8, 8) \
   intra_pred_sized(type, 16, 16) \
@@ -973,100 +764,29 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
   intra_pred_highbd_sized(type, 64, 64) \
   intra_pred_rectangular(type)
 #define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
   intra_pred_sized(type, 4, 4) \
-  intra_pred_highbd_sized(type, 2, 2) \
   intra_pred_above_4x4(type)
-#else  // CONFIG_TX64X64
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16) \
-  intra_pred_highbd_sized(type, 4, 8) \
-  intra_pred_highbd_sized(type, 8, 4) \
-  intra_pred_highbd_sized(type, 8, 16) \
-  intra_pred_highbd_sized(type, 16, 8) \
-  intra_pred_highbd_sized(type, 16, 32) \
-  intra_pred_highbd_sized(type, 32, 16)
-#define intra_pred_above_4x4(type) \
+#define intra_pred_square(type) \
+  intra_pred_sized(type, 4, 4) \
   intra_pred_sized(type, 8, 8) \
   intra_pred_sized(type, 16, 16) \
   intra_pred_sized(type, 32, 32) \
+  intra_pred_sized(type, 64, 64) \
   intra_pred_highbd_sized(type, 4, 4) \
   intra_pred_highbd_sized(type, 8, 8) \
   intra_pred_highbd_sized(type, 16, 16) \
   intra_pred_highbd_sized(type, 32, 32) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_highbd_sized(type, 2, 2) \
-  intra_pred_above_4x4(type)
-#endif  // CONFIG_TX64X64
-
-#else
-
-#if CONFIG_TX64X64
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16) \
-  intra_pred_sized(type, 32, 64) \
-  intra_pred_sized(type, 64, 32)
-#define intra_pred_above_4x4(type) \
-  intra_pred_sized(type, 8, 8) \
-  intra_pred_sized(type, 16, 16) \
-  intra_pred_sized(type, 32, 32) \
-  intra_pred_sized(type, 64, 64) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_above_4x4(type)
-#else  // CONFIG_TX64X64
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16)
-#define intra_pred_above_4x4(type) \
-  intra_pred_sized(type, 8, 8) \
-  intra_pred_sized(type, 16, 16) \
-  intra_pred_sized(type, 32, 32) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 2, 2) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_above_4x4(type)
-#endif  // CONFIG_TX64X64
-
-#endif  // CONFIG_HIGHBITDEPTH
+  intra_pred_highbd_sized(type, 64, 64)
 
-intra_pred_allsizes(d207e)
-intra_pred_allsizes(d63e)
-intra_pred_above_4x4(d45e)
-intra_pred_above_4x4(d117)
-intra_pred_above_4x4(d135)
-intra_pred_above_4x4(d153)
 intra_pred_allsizes(v)
 intra_pred_allsizes(h)
 intra_pred_allsizes(smooth)
-#if CONFIG_SMOOTH_HV
 intra_pred_allsizes(smooth_v)
 intra_pred_allsizes(smooth_h)
-#endif  // CONFIG_SMOOTH_HV
 intra_pred_allsizes(paeth)
 intra_pred_allsizes(dc_128)
 intra_pred_allsizes(dc_left)
 intra_pred_allsizes(dc_top)
-intra_pred_allsizes(dc)
+intra_pred_square(dc)
 /* clang-format on */
 #undef intra_pred_allsizes
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
index 96da49b03..e047d98bc 100644
--- a/third_party/aom/aom_dsp/intrapred_common.h
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@@ -12,19 +12,16 @@
 #ifndef _AOM_DSP_INTRAPRED_COMMON_H
 #define _AOM_DSP_INTRAPRED_COMMON_H
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 // Weights are quadratic from '1' to '1 / block_size', scaled by
 // 2^sm_weight_log2_scale.
 static const int sm_weight_log2_scale = 8;
 
-#if CONFIG_TX64X64
 // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
 #define MAX_BLOCK_DIM 64
-#else
-#define MAX_BLOCK_DIM 32
-#endif  // CONFIG_TX64X64
 
+/* clang-format off */
 static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
   // Unused, because we always offset by bs, which is at least 2.
   0, 0,
@@ -39,13 +36,12 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
   // bs = 32
   255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
   66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-#if CONFIG_TX64X64
   // bs = 64
   255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
   150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
   65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
   13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-#endif  // CONFIG_TX64X64
 };
+/* clang-format on */
 
 #endif  // _AOM_DSP_INTRAPRED_COMMON_H
diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c
deleted file mode 100644
index 6b7c1c2ab..000000000
--- a/third_party/aom/aom_dsp/inv_txfm.c
+++ /dev/null
@@ -1,1482 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
-#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
-    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
-#include "av1/common/daala_tx.h"
-#endif
-
-void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = WRAPLOW(a1);
-    op[1] = WRAPLOW(b1);
-    op[2] = WRAPLOW(c1);
-    op[3] = WRAPLOW(d1);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
-
-    ip++;
-    dest++;
-  }
-}
-
-void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = WRAPLOW(a1);
-  op[1] = op[2] = op[3] = WRAPLOW(e1);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
-    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
-    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
-    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
-    ip++;
-    dest++;
-  }
-}
-
-void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3]);
-  output[1] = WRAPLOW(step[1] + step[2]);
-  output[2] = WRAPLOW(step[1] - step[2]);
-  output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[4], temp_out[4];
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    aom_idct4_c(input, outptr);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    aom_idct4_c(temp_in, temp_out);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
-    }
-  }
-}
-
-void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
-                         int dest_stride) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  if (a1 == 0) return;
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel_add(dest[0], a1);
-    dest[1] = clip_pixel_add(dest[1], a1);
-    dest[2] = clip_pixel_add(dest[2], a1);
-    dest[3] = clip_pixel_add(dest[3], a1);
-    dest += dest_stride;
-  }
-}
-
-void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  temp1 = (step1[0] + step1[2]) * cospi_16_64;
-  temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7]);
-  output[1] = WRAPLOW(step1[1] + step1[6]);
-  output[2] = WRAPLOW(step1[2] + step1[5]);
-  output[3] = WRAPLOW(step1[3] + step1[4]);
-  output[4] = WRAPLOW(step1[3] - step1[4]);
-  output[5] = WRAPLOW(step1[2] - step1[5]);
-  output[6] = WRAPLOW(step1[1] - step1[6]);
-  output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    aom_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    aom_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  if (a1 == 0) return;
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = WRAPLOW(x0 - x2 + x3);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
-  output[2] = WRAPLOW(dct_const_round_shift(s2));
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
-void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
-  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
-  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
-  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
-  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
-  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
-  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
-  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
-
-  // stage 2
-  s0 = (int)x0;
-  s1 = (int)x1;
-  s2 = (int)x2;
-  s3 = (int)x3;
-  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
-  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
-  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
-  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-
-  // stage 3
-  s2 = (int)(cospi_16_64 * (x2 + x3));
-  s3 = (int)(cospi_16_64 * (x2 - x3));
-  s6 = (int)(cospi_16_64 * (x6 + x7));
-  s7 = (int)(cospi_16_64 * (x6 - x7));
-
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x4);
-  output[2] = WRAPLOW(x6);
-  output[3] = WRAPLOW(-x2);
-  output[4] = WRAPLOW(x3);
-  output[5] = WRAPLOW(-x7);
-  output[6] = WRAPLOW(x5);
-  output[7] = WRAPLOW(-x1);
-}
-
-void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  // only first 4 row has non-zero coefs
-  for (i = 0; i < 4; ++i) {
-    aom_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    aom_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0 / 2];
-  step1[1] = input[16 / 2];
-  step1[2] = input[8 / 2];
-  step1[3] = input[24 / 2];
-  step1[4] = input[4 / 2];
-  step1[5] = input[20 / 2];
-  step1[6] = input[12 / 2];
-  step1[7] = input[28 / 2];
-  step1[8] = input[2 / 2];
-  step1[9] = input[18 / 2];
-  step1[10] = input[10 / 2];
-  step1[11] = input[26 / 2];
-  step1[12] = input[6 / 2];
-  step1[13] = input[22 / 2];
-  step1[14] = input[14 / 2];
-  step1[15] = input[30 / 2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15]);
-  output[1] = WRAPLOW(step2[1] + step2[14]);
-  output[2] = WRAPLOW(step2[2] + step2[13]);
-  output[3] = WRAPLOW(step2[3] + step2[12]);
-  output[4] = WRAPLOW(step2[4] + step2[11]);
-  output[5] = WRAPLOW(step2[5] + step2[10]);
-  output[6] = WRAPLOW(step2[6] + step2[9]);
-  output[7] = WRAPLOW(step2[7] + step2[8]);
-  output[8] = WRAPLOW(step2[7] - step2[8]);
-  output[9] = WRAPLOW(step2[6] - step2[9]);
-  output[10] = WRAPLOW(step2[5] - step2[10]);
-  output[11] = WRAPLOW(step2[4] - step2[11]);
-  output[12] = WRAPLOW(step2[3] - step2[12]);
-  output[13] = WRAPLOW(step2[2] - step2[13]);
-  output[14] = WRAPLOW(step2[1] - step2[14]);
-  output[15] = WRAPLOW(step2[0] - step2[15]);
-}
-
-void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = output[8] = output[9] = output[10] =
-            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = WRAPLOW(s0 + s4);
-  x1 = WRAPLOW(s1 + s5);
-  x2 = WRAPLOW(s2 + s6);
-  x3 = WRAPLOW(s3 + s7);
-  x4 = WRAPLOW(s0 - s4);
-  x5 = WRAPLOW(s1 - s5);
-  x6 = WRAPLOW(s2 - s6);
-  x7 = WRAPLOW(s3 - s7);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-  x8 = WRAPLOW(s8 + s10);
-  x9 = WRAPLOW(s9 + s11);
-  x10 = WRAPLOW(s8 - s10);
-  x11 = WRAPLOW(s9 - s11);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-  x10 = WRAPLOW(dct_const_round_shift(s10));
-  x11 = WRAPLOW(dct_const_round_shift(s11));
-  x14 = WRAPLOW(dct_const_round_shift(s14));
-  x15 = WRAPLOW(dct_const_round_shift(s15));
-
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x8);
-  output[2] = WRAPLOW(x12);
-  output[3] = WRAPLOW(-x4);
-  output[4] = WRAPLOW(x6);
-  output[5] = WRAPLOW(x14);
-  output[6] = WRAPLOW(x10);
-  output[7] = WRAPLOW(x2);
-  output[8] = WRAPLOW(x3);
-  output[9] = WRAPLOW(x11);
-  output[10] = WRAPLOW(x15);
-  output[11] = WRAPLOW(x7);
-  output[12] = WRAPLOW(x5);
-  output[13] = WRAPLOW(-x13);
-  output[14] = WRAPLOW(x9);
-  output[15] = WRAPLOW(-x1);
-}
-
-void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  int i, j;
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 8x8 area, we only need to calculate first 8 rows here.
-  for (i = 0; i < 8; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  if (a1 == 0) return;
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[32], step2[32];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step2[16] = WRAPLOW(step1[16] + step1[17]);
-  step2[17] = WRAPLOW(step1[16] - step1[17]);
-  step2[18] = WRAPLOW(-step1[18] + step1[19]);
-  step2[19] = WRAPLOW(step1[18] + step1[19]);
-  step2[20] = WRAPLOW(step1[20] + step1[21]);
-  step2[21] = WRAPLOW(step1[20] - step1[21]);
-  step2[22] = WRAPLOW(-step1[22] + step1[23]);
-  step2[23] = WRAPLOW(step1[22] + step1[23]);
-  step2[24] = WRAPLOW(step1[24] + step1[25]);
-  step2[25] = WRAPLOW(step1[24] - step1[25]);
-  step2[26] = WRAPLOW(-step1[26] + step1[27]);
-  step2[27] = WRAPLOW(step1[26] + step1[27]);
-  step2[28] = WRAPLOW(step1[28] + step1[29]);
-  step2[29] = WRAPLOW(step1[28] - step1[29]);
-  step2[30] = WRAPLOW(-step1[30] + step1[31]);
-  step2[31] = WRAPLOW(step1[30] + step1[31]);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = WRAPLOW(step1[16] + step1[19]);
-  step2[17] = WRAPLOW(step1[17] + step1[18]);
-  step2[18] = WRAPLOW(step1[17] - step1[18]);
-  step2[19] = WRAPLOW(step1[16] - step1[19]);
-  step2[20] = WRAPLOW(-step1[20] + step1[23]);
-  step2[21] = WRAPLOW(-step1[21] + step1[22]);
-  step2[22] = WRAPLOW(step1[21] + step1[22]);
-  step2[23] = WRAPLOW(step1[20] + step1[23]);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27]);
-  step2[25] = WRAPLOW(step1[25] + step1[26]);
-  step2[26] = WRAPLOW(step1[25] - step1[26]);
-  step2[27] = WRAPLOW(step1[24] - step1[27]);
-  step2[28] = WRAPLOW(-step1[28] + step1[31]);
-  step2[29] = WRAPLOW(-step1[29] + step1[30]);
-  step2[30] = WRAPLOW(step1[29] + step1[30]);
-  step2[31] = WRAPLOW(step1[28] + step1[31]);
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = WRAPLOW(step1[16] + step1[23]);
-  step2[17] = WRAPLOW(step1[17] + step1[22]);
-  step2[18] = WRAPLOW(step1[18] + step1[21]);
-  step2[19] = WRAPLOW(step1[19] + step1[20]);
-  step2[20] = WRAPLOW(step1[19] - step1[20]);
-  step2[21] = WRAPLOW(step1[18] - step1[21]);
-  step2[22] = WRAPLOW(step1[17] - step1[22]);
-  step2[23] = WRAPLOW(step1[16] - step1[23]);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31]);
-  step2[25] = WRAPLOW(-step1[25] + step1[30]);
-  step2[26] = WRAPLOW(-step1[26] + step1[29]);
-  step2[27] = WRAPLOW(-step1[27] + step1[28]);
-  step2[28] = WRAPLOW(step1[27] + step1[28]);
-  step2[29] = WRAPLOW(step1[26] + step1[29]);
-  step2[30] = WRAPLOW(step1[25] + step1[30]);
-  step2[31] = WRAPLOW(step1[24] + step1[31]);
-
-  // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15]);
-  step1[1] = WRAPLOW(step2[1] + step2[14]);
-  step1[2] = WRAPLOW(step2[2] + step2[13]);
-  step1[3] = WRAPLOW(step2[3] + step2[12]);
-  step1[4] = WRAPLOW(step2[4] + step2[11]);
-  step1[5] = WRAPLOW(step2[5] + step2[10]);
-  step1[6] = WRAPLOW(step2[6] + step2[9]);
-  step1[7] = WRAPLOW(step2[7] + step2[8]);
-  step1[8] = WRAPLOW(step2[7] - step2[8]);
-  step1[9] = WRAPLOW(step2[6] - step2[9]);
-  step1[10] = WRAPLOW(step2[5] - step2[10]);
-  step1[11] = WRAPLOW(step2[4] - step2[11]);
-  step1[12] = WRAPLOW(step2[3] - step2[12]);
-  step1[13] = WRAPLOW(step2[2] - step2[13]);
-  step1[14] = WRAPLOW(step2[1] - step2[14]);
-  step1[15] = WRAPLOW(step2[0] - step2[15]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31]);
-  output[1] = WRAPLOW(step1[1] + step1[30]);
-  output[2] = WRAPLOW(step1[2] + step1[29]);
-  output[3] = WRAPLOW(step1[3] + step1[28]);
-  output[4] = WRAPLOW(step1[4] + step1[27]);
-  output[5] = WRAPLOW(step1[5] + step1[26]);
-  output[6] = WRAPLOW(step1[6] + step1[25]);
-  output[7] = WRAPLOW(step1[7] + step1[24]);
-  output[8] = WRAPLOW(step1[8] + step1[23]);
-  output[9] = WRAPLOW(step1[9] + step1[22]);
-  output[10] = WRAPLOW(step1[10] + step1[21]);
-  output[11] = WRAPLOW(step1[11] + step1[20]);
-  output[12] = WRAPLOW(step1[12] + step1[19]);
-  output[13] = WRAPLOW(step1[13] + step1[18]);
-  output[14] = WRAPLOW(step1[14] + step1[17]);
-  output[15] = WRAPLOW(step1[15] + step1[16]);
-  output[16] = WRAPLOW(step1[15] - step1[16]);
-  output[17] = WRAPLOW(step1[14] - step1[17]);
-  output[18] = WRAPLOW(step1[13] - step1[18]);
-  output[19] = WRAPLOW(step1[12] - step1[19]);
-  output[20] = WRAPLOW(step1[11] - step1[20]);
-  output[21] = WRAPLOW(step1[10] - step1[21]);
-  output[22] = WRAPLOW(step1[9] - step1[22]);
-  output[23] = WRAPLOW(step1[8] - step1[23]);
-  output[24] = WRAPLOW(step1[7] - step1[24]);
-  output[25] = WRAPLOW(step1[6] - step1[25]);
-  output[26] = WRAPLOW(step1[5] - step1[26]);
-  output[27] = WRAPLOW(step1[4] - step1[27]);
-  output[28] = WRAPLOW(step1[3] - step1[28]);
-  output[29] = WRAPLOW(step1[2] - step1[29]);
-  output[30] = WRAPLOW(step1[1] - step1[30]);
-  output[31] = WRAPLOW(step1[0] - step1[31]);
-}
-
-#if CONFIG_MRC_TX
-void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride, uint8_t *mask) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      aom_idct32_c(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      // Only add the coefficient if the mask value is 1
-      int mask_val = mask[j * 32 + i];
-      dest[j * stride + i] =
-          mask_val ? clip_pixel_add(dest[j * stride + i],
-                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
-                   : dest[j * stride + i];
-    }
-  }
-}
-
-void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             uint8_t *mask) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 16x16 has non-zero coeff
-  for (i = 0; i < 16; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      // Only add the coefficient if the mask value is 1
-      int mask_val = mask[j * 32 + i];
-      dest[j * stride + i] =
-          mask_val ? clip_pixel_add(dest[j * stride + i],
-                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
-                   : dest[j * stride + i];
-    }
-  }
-}
-
-void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            uint8_t *mask) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 8x8 has non-zero coeff
-  for (i = 0; i < 8; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      // Only add the coefficient if the mask value is 1
-      int mask_val = mask[j * 32 + i];
-      dest[j * stride + i] =
-          mask_val ? clip_pixel_add(dest[j * stride + i],
-                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
-                   : dest[j * stride + i];
-    }
-  }
-}
-#endif  // CONFIG_MRC_TX
-
-void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      aom_idct32_c(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 16x16 has non-zero coeff
-  for (i = 0; i < 16; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 8x8 has non-zero coeff
-  for (i = 0; i < 8; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  if (a1 == 0) return;
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = HIGHBD_WRAPLOW(a1, bd);
-    op[1] = HIGHBD_WRAPLOW(b1, bd);
-    op[2] = HIGHBD_WRAPLOW(c1, bd);
-    op[3] = HIGHBD_WRAPLOW(d1, bd);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] =
-        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
-    dest[stride * 1] =
-        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
-    dest[stride * 2] =
-        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
-    dest[stride * 3] =
-        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
-
-    ip++;
-    dest++;
-  }
-}
-
-void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
-                                int dest_stride, int bd) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  (void)bd;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = HIGHBD_WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] =
-        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
-    dest[dest_stride * 1] =
-        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
-    dest[dest_stride * 2] =
-        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
-    dest[dest_stride * 3] =
-        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
-    ip++;
-    dest++;
-  }
-}
diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h
deleted file mode 100644
index 644a6599f..000000000
--- a/third_party/aom/aom_dsp/inv_txfm.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_INV_TXFM_H_
-#define AOM_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
-  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-}
-
-static INLINE tran_high_t check_range(tran_high_t input, int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid AV1 input streams, intermediate stage coefficients should always
-  // stay within the range of a signed 16 bit integer. Coefficients can go out
-  // of this range for invalid/corrupt AV1 streams. However, strictly checking
-  // this range for every intermediate coefficient can burdensome for a decoder,
-  // therefore the following assertion is only enabled when configured with
-  // --enable-coefficient-range-checking.
-  // For valid highbitdepth AV1 streams, intermediate stage coefficients will
-  // stay within the ranges:
-  // - 8 bit: signed 16 bit integer
-  // - 10 bit: signed 18 bit integer
-  // - 12 bit: signed 20 bit integer
-  const int32_t int_max = (1 << (7 + bd)) - 1;
-  const int32_t int_min = -int_max - 1;
-  assert(int_min <= input);
-  assert(input <= int_max);
-  (void)int_min;
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  (void)bd;
-  return input;
-}
-
-#define WRAPLOW(x) ((int32_t)check_range(x, 8))
-#define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd))
-
-#if CONFIG_MRC_TX
-// These each perform dct but add coefficients based on a mask
-void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride, uint8_t *mask);
-
-void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                             uint8_t *mask);
-
-void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                            uint8_t *mask);
-#endif  // CONFIG_MRC_TX
-
-void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
-void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
-void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
-void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
-#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
-void aom_idct64_c(const tran_low_t *input, tran_low_t *output);
-#endif
-void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
-void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
-void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
-                                             int bd) {
-  trans = HIGHBD_WRAPLOW(trans, bd);
-  return clip_pixel_highbd(dest + (int)trans, bd);
-}
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans);
-  return clip_pixel(dest + (int)trans);
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_DSP_INV_TXFM_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
index 69f131378..a3f261824 100644
--- a/third_party/aom/aom_dsp/loopfilter.c
+++ b/third_party/aom/aom_dsp/loopfilter.c
@@ -11,8 +11,9 @@
 
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
@@ -20,18 +21,6 @@ static INLINE int8_t signed_char_clamp(int t) {
   return (int8_t)clamp(t, -128, 127);
 }
 
-#define PARALLEL_DEBLOCKING_11_TAP 0
-#define PARALLEL_DEBLOCKING_9_TAP 0
-
-#if CONFIG_DEBLOCK_13TAP
-#define PARALLEL_DEBLOCKING_13_TAP 1
-#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1
-#else
-#define PARALLEL_DEBLOCKING_13_TAP 0
-#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0
-#endif
-
-#if CONFIG_HIGHBITDEPTH
 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
   switch (bd) {
     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
@@ -40,8 +29,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) {
     default: return (int16_t)clamp(t, -128, 128 - 1);
   }
 }
-#endif
-#if CONFIG_PARALLEL_DEBLOCKING
+
 // should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
                                   uint8_t p0, uint8_t q0, uint8_t q1) {
@@ -51,7 +39,7 @@ static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
   return ~mask;
 }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                  uint8_t q1, uint8_t q2, uint8_t q3) {
@@ -66,7 +54,18 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
   return ~mask;
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                         uint8_t p2, uint8_t p1, uint8_t p0,
+                                         uint8_t q0, uint8_t q1, uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
 static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
                                        uint8_t p0, uint8_t q0, uint8_t q1,
                                        uint8_t q2) {
@@ -77,7 +76,6 @@ static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
   mask |= (abs(q2 - q0) > thresh) * -1;
   return ~mask;
 }
-#endif
 
 static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                 uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
@@ -92,39 +90,6 @@ static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
   return ~mask;
 }
 
-#if PARALLEL_DEBLOCKING_9_TAP
-static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0,
-                                uint8_t q0, uint8_t q4) {
-  int8_t mask = 0;
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  return ~mask;
-}
-#endif
-
-#if PARALLEL_DEBLOCKING_11_TAP
-static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4,
-                                uint8_t p0, uint8_t q0, uint8_t q4,
-                                uint8_t q5) {
-  int8_t mask = 0;
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  mask |= (abs(p5 - p0) > thresh) * -1;
-  mask |= (abs(q5 - q0) > thresh) * -1;
-  return ~mask;
-}
-#endif
-
-static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
-                                uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
-                                uint8_t q1, uint8_t q2, uint8_t q3,
-                                uint8_t q4) {
-  int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
-  mask |= (abs(p4 - p0) > thresh) * -1;
-  mask |= (abs(q4 - q0) > thresh) * -1;
-  return ~mask;
-}
-
 // is there high edge variance internal edge: 11111111 yes, 00000000 no
 static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                               uint8_t q0, uint8_t q1) {
@@ -170,25 +135,14 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
                             const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p];
     const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
     ++s;
   }
@@ -199,31 +153,20 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
   aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+  aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint8_t p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1];
     const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
     s += pitch;
   }
@@ -234,10 +177,9 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
   aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+  aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
@@ -254,7 +196,6 @@ static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
     filter4(mask, thresh, op1, op0, oq0, oq1);
   }
 }
-#endif
 
 static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
@@ -276,40 +217,38 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
   }
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
 
     const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
     const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
     filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
             s + 2 * p);
     ++s;
   }
 }
-#endif
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                 const uint8_t *limit0, const uint8_t *thresh0,
+                                 const uint8_t *blimit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
+  aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
+}
 
 void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
@@ -331,39 +270,37 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
   aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+  aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
     const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
     const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
     filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
     s += pitch;
   }
 }
-#endif
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1) {
+  aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
+  aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
 
 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
@@ -382,10 +319,9 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
   aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
+  aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-#if PARALLEL_DEBLOCKING_13_TAP
 static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
                             int8_t flat2, uint8_t *op6, uint8_t *op5,
                             uint8_t *op4, uint8_t *op3, uint8_t *op2,
@@ -433,186 +369,43 @@ static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
   }
 }
-#endif
-
-#if PARALLEL_DEBLOCKING_11_TAP
-static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op5, uint8_t *op4,
-                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
-                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
-                            uint8_t *oq2, uint8_t *oq3, uint8_t *oq4,
-                            uint8_t *oq5) {
-  if (flat2 && flat && mask) {
-    const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1,
-                  p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
-                  q5 = *oq5;
-
-    // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1]
-    *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12;
-    *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12;
-    *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12;
-    *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12;
-    *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12;
-    *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12;
-    *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12;
-    *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12;
-    *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12;
-    *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12;
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-#endif
-
-#if PARALLEL_DEBLOCKING_9_TAP
-static INLINE void filter10(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op4, uint8_t *op3,
-                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
-                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
-                            uint8_t *oq3, uint8_t *oq4) {
-  if (flat2 && flat && mask) {
-    const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4;
-
-    // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1]
-    *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10;
-    *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10;
-    *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10;
-    *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10;
-    *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10;
-    *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10;
-    *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10;
-    *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10;
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-#endif
-
-static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op7, uint8_t *op6,
-                            uint8_t *op5, uint8_t *op4, uint8_t *op3,
-                            uint8_t *op2, uint8_t *op1, uint8_t *op0,
-                            uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
-                            uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
-                            uint8_t *oq6, uint8_t *oq7) {
-  if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
-                  p2 = *op2, p1 = *op1, p0 = *op0;
-
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
-                  q5 = *oq5, q6 = *oq6, q7 = *oq7;
-
-    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
-    *op6 = ROUND_POWER_OF_TWO(
-        p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
-    *op5 = ROUND_POWER_OF_TWO(
-        p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
-                                  q0 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
-                                  q1 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
-                                  q2 + q3 + q4 + q5 + q6 + q7,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
-                                  q3 + q4 + q5 + q6 + q7 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
-    *oq6 = ROUND_POWER_OF_TWO(
-        p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
 
 static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int count) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int step = 4;
-#else
-  int step = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < step * count; ++i) {
-    const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
-                  p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
-                  p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
+                  p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
-                  q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p];
+                  q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-
-#if PARALLEL_DEBLOCKING_13_TAP
-    (void)p7;
-    (void)q7;
     const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
 
     filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
              s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
              s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
-
-#elif PARALLEL_DEBLOCKING_11_TAP
-    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
-
-    filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
-             s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p,
-             s + 3 * p, s + 4 * p, s + 5 * p);
-
-#elif PARALLEL_DEBLOCKING_9_TAP
-    const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
-
-    filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p,
-             s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p,
-             s + 4 * p);
-#else
-    const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
-
-    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
-             s + 7 * p);
-#endif
-
     ++s;
   }
 }
 
-void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh) {
+void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
 }
 
-void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                                  const uint8_t *limit, const uint8_t *thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
-#else
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
-#endif
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {
+  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+  mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
 }
 
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
@@ -621,60 +414,34 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
   int i;
 
   for (i = 0; i < count; ++i) {
-    const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4],
-                  p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
+                  p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
-                  q5 = s[5], q6 = s[6], q7 = s[7];
+                  q5 = s[5], q6 = s[6];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-
-#if PARALLEL_DEBLOCKING_13_TAP
-    (void)p7;
-    (void)q7;
     const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
 
     filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
              s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
-#elif PARALLEL_DEBLOCKING_11_TAP
-    const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
-
-    filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
-             s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5);
-#elif PARALLEL_DEBLOCKING_9_TAP
-    const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4);
-
-    filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s,
-             s + 1, s + 2, s + 3, s + 4);
-
-#else
-    const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7);
-
-    filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
-             s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
-             s + 7);
-#endif
-
     s += p;
   }
 }
 
-void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
-#else
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
-#endif
 }
 
-void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0,
+                                const uint8_t *blimit1, const uint8_t *limit1,
+                                const uint8_t *thresh1) {
+  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+  mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
 }
 
-#if CONFIG_HIGHBITDEPTH
-#if CONFIG_PARALLEL_DEBLOCKING
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                          uint16_t p1, uint16_t p0, uint16_t q0,
@@ -687,7 +454,6 @@ static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
   return ~mask;
 }
-#endif  // CONFIG_PARALLEL_DEBLOCKING
 
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
@@ -707,7 +473,22 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
   return ~mask;
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
+static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                                uint16_t p2, uint16_t p1,
+                                                uint16_t p0, uint16_t q0,
+                                                uint16_t q1, uint16_t q2,
+                                                int bd) {
+  int8_t mask = 0;
+  int16_t limit16 = (uint16_t)limit << (bd - 8);
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+  mask |= (abs(p2 - p1) > limit16) * -1;
+  mask |= (abs(p1 - p0) > limit16) * -1;
+  mask |= (abs(q1 - q0) > limit16) * -1;
+  mask |= (abs(q2 - q1) > limit16) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+  return ~mask;
+}
+
 static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
                                               uint16_t p1, uint16_t p0,
                                               uint16_t q0, uint16_t q1,
@@ -720,7 +501,6 @@ static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
   mask |= (abs(q2 - q0) > thresh16) * -1;
   return ~mask;
 }
-#endif
 
 static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                        uint16_t p1, uint16_t p0, uint16_t q0,
@@ -737,17 +517,6 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
   return ~mask;
 }
 
-static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
-                                       uint16_t p2, uint16_t p1, uint16_t p0,
-                                       uint16_t q0, uint16_t q1, uint16_t q2,
-                                       uint16_t q3, uint16_t q4, int bd) {
-  int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  mask |= (abs(p4 - p0) > thresh16) * -1;
-  mask |= (abs(q4 - q0) > thresh16) * -1;
-  return ~mask;
-}
-
 // Is there high edge variance internal edge:
 // 11111111_11111111 yes, 00000000_00000000 no ?
 static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
@@ -798,34 +567,17 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p1 = s[-2 * p];
     const uint16_t p0 = s[-p];
     const uint16_t q0 = s[0 * p];
     const uint16_t q1 = s[1 * p];
     const int8_t mask =
         highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
     ++s;
   }
@@ -836,33 +588,22 @@ void aom_highbd_lpf_horizontal_4_dual_c(
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
 }
 
 void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-#if !CONFIG_PARALLEL_DEBLOCKING
-    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-#else   // CONFIG_PARALLEL_DEBLOCKING
     const uint16_t p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1];
     const int8_t mask =
         highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
     highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
     s += pitch;
   }
@@ -873,11 +614,10 @@ void aom_highbd_lpf_vertical_4_dual_c(
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+  aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
                               bd);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
                                   uint16_t *op2, uint16_t *op1, uint16_t *op0,
                                   uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
@@ -895,7 +635,6 @@ static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
     highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
   }
 }
-#endif
 
 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
@@ -921,11 +660,7 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
@@ -943,74 +678,75 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
   }
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
 
     const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
     const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
     highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
                    s + 1 * p, s + 2 * p, bd);
     ++s;
   }
 }
-#endif
+
+void aom_highbd_lpf_horizontal_6_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
 
 void aom_highbd_lpf_horizontal_8_dual_c(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
 }
 
-#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
 void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
     const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
     const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
     highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
                    bd);
     s += pitch;
   }
 }
-#endif
+
+void aom_highbd_lpf_vertical_6_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
+  aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                              bd);
+}
 
 void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int count = 4;
-#else
-  int count = 8;
-#endif
 
   for (i = 0; i < count; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
@@ -1030,11 +766,10 @@ void aom_highbd_lpf_vertical_8_dual_c(
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1,
+  aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
                               bd);
 }
 
-#if PARALLEL_DEBLOCKING_13_TAP
 static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
                                    int8_t flat2, uint16_t *op6, uint16_t *op5,
                                    uint16_t *op4, uint16_t *op3, uint16_t *op2,
@@ -1094,73 +829,6 @@ static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
                    bd);
   }
 }
-#endif
-
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
-                                   int8_t flat2, uint16_t *op7, uint16_t *op6,
-                                   uint16_t *op5, uint16_t *op4, uint16_t *op3,
-                                   uint16_t *op2, uint16_t *op1, uint16_t *op0,
-                                   uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
-                                   uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
-                                   uint16_t *oq6, uint16_t *oq7, int bd) {
-  if (flat2 && flat && mask) {
-    const uint16_t p7 = *op7;
-    const uint16_t p6 = *op6;
-    const uint16_t p5 = *op5;
-    const uint16_t p4 = *op4;
-    const uint16_t p3 = *op3;
-    const uint16_t p2 = *op2;
-    const uint16_t p1 = *op1;
-    const uint16_t p0 = *op0;
-    const uint16_t q0 = *oq0;
-    const uint16_t q1 = *oq1;
-    const uint16_t q2 = *oq2;
-    const uint16_t q3 = *oq3;
-    const uint16_t q4 = *oq4;
-    const uint16_t q5 = *oq5;
-    const uint16_t q6 = *oq6;
-    const uint16_t q7 = *oq7;
-
-    // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
-    *op6 = ROUND_POWER_OF_TWO(
-        p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
-    *op5 = ROUND_POWER_OF_TWO(
-        p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
-                                  q0 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
-                                  q1 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
-                                  q2 + q3 + q4 + q5 + q6 + q7,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
-                                  q3 + q4 + q5 + q6 + q7 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
-    *oq6 = ROUND_POWER_OF_TWO(
-        p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else {
-    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
-                   bd);
-  }
-}
 
 static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                             const uint8_t *blimit,
@@ -1168,11 +836,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                             const uint8_t *thresh, int count,
                                             int bd) {
   int i;
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   int step = 4;
-#else
-  int step = 8;
-#endif
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
@@ -1190,7 +854,6 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
 
-#if PARALLEL_DEBLOCKING_13_TAP
     const int8_t flat2 =
         highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
                           s[5 * p], s[6 * p], bd);
@@ -1198,36 +861,22 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
     highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
                     s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
                     s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
-#else
-    const int8_t flat2 =
-        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
-                    s + 6 * p, s + 7 * p, bd);
-#endif
     ++s;
   }
 }
 
-void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
-                                        const uint8_t *blimit,
-                                        const uint8_t *limit,
-                                        const uint8_t *thresh, int bd) {
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int bd) {
   highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
 }
 
-void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
-                                         const uint8_t *blimit,
-                                         const uint8_t *limit,
-                                         const uint8_t *thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
-#else
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
-#endif
+void aom_highbd_lpf_horizontal_14_dual_c(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+  highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
 }
 
 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
@@ -1250,43 +899,27 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-#if PARALLEL_DEBLOCKING_13_TAP
     const int8_t flat2 =
         highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
 
     highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
                     s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
                     s + 6, bd);
-#else
-    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
-                                           q0, s[4], s[5], s[6], s[7], bd);
-
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
-                    s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
-                    s + 5, s + 6, s + 7, bd);
-#endif
     s += p;
   }
 }
 
-void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
-#else
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
-#endif
 }
 
-void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
-                                       const uint8_t *blimit,
-                                       const uint8_t *limit,
-                                       const uint8_t *thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
-#else
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
-#endif
+void aom_highbd_lpf_vertical_14_dual_c(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
+  highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+                                4, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
index 4c6e201e1..96d04cff0 100644
--- a/third_party/aom/aom_dsp/mips/add_noise_msa.c
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -10,7 +10,8 @@
  */
 
 #include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "aom_dsp/mips/macros_msa.h"
 
 void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                              char blackclamp[16], char whiteclamp[16],
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
deleted file mode 100644
index 847394a3d..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst0, dst1, dst2, dst3, res2, res3;
-  v16u8 mask0, mask1, mask2, mask3;
-  v8i16 filt, res0, res1;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, res0, res1);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  SRARI_H2_SH(res0, res1, FILTER_BITS);
-  SAT_SH2_SH(res0, res1, 7);
-  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  XORI_B2_128_UB(res2, res3);
-  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8i16 filt, vec0, vec1, vec2, vec3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  src += (4 * src_stride);
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, vec0, vec1);
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, vec2, vec3);
-  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
-  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
-              res3);
-  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
-  XORI_B2_128_UB(res0, res2);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
-  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
-  ST4x8_UB(res0, res2, dst, dst_stride);
-}
-
-static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  int32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    src += (4 * src_stride);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
-                            dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  int32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
-  v8i16 filt, out0, out1, out2, out3;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height >> 1; loop_cnt--;) {
-    LD_SB2(src, src_stride, src0, src2);
-    LD_SB2(src + 8, src_stride, src1, src3);
-    src += (2 * src_stride);
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
-    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
-    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
-               vec14);
-    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
-               vec15);
-    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
-                vec9, vec10, vec11);
-    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
-                 vec2, vec3);
-    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
-                 vec9, vec10, vec11);
-    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
-                out2, out3);
-    LD_UB2(dst, dst_stride, dst0, dst1);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
-    dst += dst_stride;
-    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
-  v8i16 filt, out0, out1, out2, out3;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height; loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
-    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
-    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
-               vec14);
-    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
-               vec15);
-    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
-                vec9, vec10, vec11);
-    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
-                 vec2, vec3);
-    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
-                 vec9, vec10, vec11);
-    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
-                out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    LD_UB2(dst, 16, dst1, dst2);
-    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
-    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt, cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
-  v8i16 filt, out0, out1, out2, out3;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height; loop_cnt--;) {
-    for (cnt = 0; cnt < 2; ++cnt) {
-      src0 = LD_SB(&src[cnt << 5]);
-      src2 = LD_SB(&src[16 + (cnt << 5)]);
-      src3 = LD_SB(&src[24 + (cnt << 5)]);
-      src1 = __msa_sldi_b(src2, src0, 8);
-
-      XORI_B4_128_SB(src0, src1, src2, src3);
-      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
-                 vec12);
-      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
-                 vec13);
-      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
-                 vec14);
-      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
-                 vec15);
-      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
-                  vec1, vec2, vec3);
-      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
-                  vec9, vec10, vec11);
-      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
-                   vec1, vec2, vec3);
-      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
-                   vec9, vec10, vec11);
-      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
-                  out2, out3);
-      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-      SAT_SH4_SH(out0, out1, out2, out3, 7);
-      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
-      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
-      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
-    }
-
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
-  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
-  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8u16 vec4, vec5, vec6, vec7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
-  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
-              vec6, vec7);
-  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
-              res3);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
-}
-
-static void common_hz_2t_and_aver_dst_8x8mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter, int32_t height) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
-  dst += (4 * dst_stride);
-
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
-  dst += (4 * dst_stride);
-
-  if (16 == height) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                       dst_stride);
-  }
-}
-
-static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
-                                          filter, height);
-  }
-}
-
-static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src2, src4, src6);
-  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-  src += (4 * src_stride);
-
-  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
-              res2, res3);
-  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
-              res6, res7);
-  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
-  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
-  dst += dst_stride;
-  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
-  dst += dst_stride;
-  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
-  dst += dst_stride;
-  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
-  dst += dst_stride;
-
-  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
-                res2, res3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
-                res6, res7);
-    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
-    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
-    dst += dst_stride;
-    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
-    dst += dst_stride;
-    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
-    dst += dst_stride;
-    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-    src4 = LD_SB(src);
-    src6 = LD_SB(src + 16);
-    src7 = LD_SB(src + 24);
-    src5 = __msa_sldi_b(src6, src4, 8);
-    src += src_stride;
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
-                res2, res3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
-                res6, res7);
-    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
-    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-    LD_UB2(dst, 16, dst0, dst1);
-    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
-    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
-    dst += dst_stride;
-    LD_UB2(dst, 16, dst2, dst3);
-    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
-    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  for (loop_cnt = height; loop_cnt--;) {
-    LD_SB4(src, 16, src0, src2, src4, src6);
-    src7 = LD_SB(src + 56);
-    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
-    src += src_stride;
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
-    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
-    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
-    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  int8_t cnt, filt_hor[8];
-
-  assert(x_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 8:
-        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 16:
-        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 32:
-        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      case 64:
-        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_hor[3], h);
-        break;
-      default:
-        aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 8:
-        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 16:
-        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 32:
-        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_hor, h);
-        break;
-      case 64:
-        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_hor, h);
-        break;
-      default:
-        aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
deleted file mode 100644
index bed600d5b..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
-    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
-    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
-    SRARI_H2_SH(res0, res1, FILTER_BITS);
-    SAT_SH2_SH(res0, res1, 7);
-    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
-    XORI_B2_128_UB(tmp0, tmp1);
-    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
-    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out5 = hz_out9;
-    vec0 = vec2;
-    vec1 = vec3;
-    vec2 = vec4;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
-  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
-  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
-    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
-                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
-    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
-                            dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out6 = hz_out10;
-    out0 = out2;
-    out1 = out3;
-    out2 = out8;
-    out4 = out6;
-    out5 = out7;
-    out6 = out9;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
-                                          filter_horiz, filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
-                                          filter_horiz, filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 8; multiple8_cnt--;) {
-    common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
-                                          filter_horiz, filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 dst0, dst1, dst2, dst3, res0, res1;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  filt = LD_UH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
-  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-  src8 = LD_SB(src);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
-  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
-  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
-  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
-             hz_out3, hz_out5, 8);
-  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
-              tmp1, tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
-              res3);
-  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert);
-  } else if (8 == height) {
-    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert);
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-  src += (5 * src_stride);
-
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
-  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                     dst_stride);
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_SB(src);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert);
-  } else {
-    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
-        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
-    dst += dst_stride;
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride,
-                                           filter_horiz, filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  int8_t cnt, filt_hor[8], filt_ver[8];
-
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0 &&
-      ((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, &filt_hor[3],
-                                              &filt_ver[3], h);
-        break;
-      case 8:
-        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, &filt_hor[3],
-                                              &filt_ver[3], h);
-        break;
-      case 16:
-        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride,
-                                               &filt_hor[3], &filt_ver[3], h);
-        break;
-      case 32:
-        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride,
-                                               &filt_hor[3], &filt_ver[3], h);
-        break;
-      case 64:
-        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride,
-                                               &filt_hor[3], &filt_ver[3], h);
-        break;
-      default:
-        aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else if (((const int32_t *)filter_x)[0] == 0 ||
-             ((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-  } else {
-    switch (w) {
-      case 4:
-        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, filt_hor,
-                                              filt_ver, h);
-        break;
-      case 8:
-        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                              (int32_t)dst_stride, filt_hor,
-                                              filt_ver, h);
-        break;
-      case 16:
-        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride, filt_hor,
-                                               filt_ver, h);
-        break;
-      case 32:
-        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride, filt_hor,
-                                               filt_ver, h);
-        break;
-      case 64:
-        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                               (int32_t)dst_stride, filt_hor,
-                                               filt_ver, h);
-        break;
-      default:
-        aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
deleted file mode 100644
index dae771104..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c
+++ /dev/null
@@ -1,677 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3, out;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
-  v16i8 src10998, filt0, filt1, filt2, filt3;
-  v8i16 filt, out10, out32;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
-             src4332, src6554);
-  XORI_B3_128_SB(src2110, src4332, src6554);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
-    XORI_B2_128_SB(src8776, src10998);
-    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
-                                filt1, filt2, filt3);
-    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
-                                filt1, filt2, filt3);
-    SRARI_H2_SH(out10, out32, FILTER_BITS);
-    SAT_SH2_SH(out10, out32, 7);
-    out = PCKEV_XORI128_UB(out10, out32);
-    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
-    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
-    out = __msa_aver_u_b(out, dst0);
-
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    src2110 = src6554;
-    src4332 = src8776;
-    src6554 = src10998;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
-  v8i16 filt, out0, out1, out2, out3;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
-                               filt2, filt3);
-    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
-                               filt2, filt3);
-    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
-                               filt2, filt3);
-    out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                               filt1, filt2, filt3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
-                            dst_stride);
-    dst += (4 * dst_stride);
-
-    src10_r = src54_r;
-    src32_r = src76_r;
-    src54_r = src98_r;
-    src21_r = src65_r;
-    src43_r = src87_r;
-    src65_r = src109_r;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_and_aver_dst_16w_mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter, int32_t height, int32_t width) {
-  const uint8_t *src_tmp;
-  uint8_t *dst_tmp;
-  uint32_t loop_cnt, cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
-  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  for (cnt = (width >> 4); cnt--;) {
-    src_tmp = src;
-    dst_tmp = dst;
-
-    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
-    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-    src_tmp += (7 * src_stride);
-
-    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-               src54_r, src21_r);
-    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
-               src54_l, src21_l);
-    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
-    for (loop_cnt = (height >> 2); loop_cnt--;) {
-      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
-      src_tmp += (4 * src_stride);
-
-      LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
-      XORI_B4_128_SB(src7, src8, src9, src10);
-      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-                 src87_r, src98_r, src109_r);
-      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
-                 src87_l, src98_l, src109_l);
-      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
-                                   filt1, filt2, filt3);
-      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
-                                   filt1, filt2, filt3);
-      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
-                                   filt1, filt2, filt3);
-      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                                   filt1, filt2, filt3);
-      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
-                                   filt1, filt2, filt3);
-      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
-                                   filt1, filt2, filt3);
-      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
-                                   filt1, filt2, filt3);
-      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
-                                   filt1, filt2, filt3);
-      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
-      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
-      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
-      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
-      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
-                  out3_r, tmp0, tmp1, tmp2, tmp3);
-      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
-      AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
-                  dst2, dst3);
-      ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
-      dst_tmp += (4 * dst_stride);
-
-      src10_r = src54_r;
-      src32_r = src76_r;
-      src54_r = src98_r;
-      src21_r = src65_r;
-      src43_r = src87_r;
-      src65_r = src109_r;
-      src10_l = src54_l;
-      src32_l = src76_l;
-      src54_l = src98_l;
-      src21_l = src65_l;
-      src43_l = src87_l;
-      src65_l = src109_l;
-      src6 = src10;
-    }
-
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
-                                         filter, height, 16);
-}
-
-static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
-                                         filter, height, 32);
-}
-
-static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
-                                         filter, height, 64);
-}
-
-static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16i8 src0, src1, src2, src3, src4;
-  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
-  v16i8 src10_r, src32_r, src21_r, src43_r;
-  v8i16 filt;
-  v8u16 tmp0, tmp1;
-
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-
-  src4 = LD_SB(src);
-  src += src_stride;
-
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-  dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
-  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-             src32_r, src43_r);
-  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
-  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-  out = __msa_aver_u_b(out, dst0);
-
-  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
-  v16u8 src2110, src4332, src6554, src8776, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-  src8 = LD_SB(src);
-
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
-             dst3);
-  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-             src32_r, src43_r);
-  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
-             src76_r, src87_r);
-  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
-             src76_r, src2110, src4332, src6554, src8776);
-  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
-              tmp0, tmp1, tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
-  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
-  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter) {
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
-  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-              tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                     dst_stride);
-}
-
-static void common_vt_2t_and_aver_dst_8x8mult_msa(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
-    src += (8 * src_stride);
-    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
-
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
-               vec3);
-    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
-               vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
-                       dst_stride);
-    dst += (4 * dst_stride);
-
-    src0 = src8;
-  }
-}
-
-static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
-                                             int32_t src_stride, uint8_t *dst,
-                                             int32_t dst_stride, int8_t *filter,
-                                             int32_t height) {
-  if (4 == height) {
-    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
-                                          filter, height);
-  }
-}
-
-static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
-  /* rearranging filter_y */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-    dst += dst_stride;
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
-    dst += dst_stride;
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
-    dst += dst_stride;
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
-    dst += dst_stride;
-
-    src0 = src4;
-  }
-}
-
-static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
-
-  /* rearranging filter_y */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_UB2(src, 16, src0, src5);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
-    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
-    LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
-    src += (4 * src_stride);
-
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
-
-    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
-    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
-
-    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
-    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
-    dst += (4 * dst_stride);
-
-    src0 = src4;
-    src5 = src9;
-  }
-}
-
-static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5;
-  v16u8 src6, src7, src8, src9, src10, src11, filt0;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8u16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_UB4(src, 16, src0, src3, src6, src9);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    LD_UB2(src, src_stride, src1, src2);
-    LD_UB2(dst, dst_stride, dst0, dst1);
-    LD_UB2(src + 16, src_stride, src4, src5);
-    LD_UB2(dst + 16, dst_stride, dst2, dst3);
-    LD_UB2(src + 32, src_stride, src7, src8);
-    LD_UB2(dst + 32, dst_stride, dst4, dst5);
-    LD_UB2(src + 48, src_stride, src10, src11);
-    LD_UB2(dst + 48, dst_stride, dst6, dst7);
-    src += (2 * src_stride);
-
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
-
-    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
-    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
-    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
-    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
-
-    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
-    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
-
-    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
-    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
-    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
-    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
-    dst += (2 * dst_stride);
-
-    src0 = src2;
-    src3 = src5;
-    src6 = src8;
-    src9 = src11;
-  }
-}
-
-void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
-                                int h) {
-  int8_t cnt, filt_ver[8];
-
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 8:
-        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 16:
-        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 32:
-        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      case 64:
-        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, &filt_ver[3], h);
-        break;
-      default:
-        aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_ver, h);
-        break;
-      case 8:
-        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
-                                         (int32_t)dst_stride, filt_ver, h);
-        break;
-      case 16:
-        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_ver, h);
-
-        break;
-      case 32:
-        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_ver, h);
-        break;
-      case 64:
-        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
-                                          (int32_t)dst_stride, filt_ver, h);
-        break;
-      default:
-        aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
index fc3a823c5..363fad308 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
@@ -10,7 +10,9 @@
  */
 
 #include <assert.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/aom_convolve_msa.h"
 
 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
deleted file mode 100644
index a4d594931..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c
+++ /dev/null
@@ -1,630 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-const uint8_t mc_filt_mask_arr[16 * 3] = {
-  /* 8 width cases */
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  /* 4 width cases */
-  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
-  /* 4 width cases */
-  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
-};
-
-static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
-  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
-    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
-    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
-    SAT_SH2_SH(tmp0, tmp1, 7);
-    out = PCKEV_XORI128_UB(tmp0, tmp1);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out5 = hz_out9;
-    out0 = out2;
-    out1 = out3;
-    out2 = out4;
-  }
-}
-
-static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
-  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
-  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= (3 + 3 * src_stride);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
-                            filt_hz1, filt_hz2, filt_hz3);
-
-  filt = LD_SH(filter_vert);
-  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
-
-  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
-  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
-  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    XORI_B4_128_SB(src7, src8, src9, src10);
-
-    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
-    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
-                              filt_hz1, filt_hz2, filt_hz3);
-    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
-    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-
-    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
-                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
-    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
-    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
-                               filt_vt2, filt_vt3);
-    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
-    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
-    ST8x4_UB(vec0, vec1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out6 = hz_out10;
-    out0 = out2;
-    out1 = out3;
-    out2 = out8;
-    out4 = out6;
-    out5 = out7;
-    out6 = out9;
-  }
-}
-
-static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                             filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                             filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 8; multiple8_cnt--;) {
-    common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                             filter_vert, height);
-    src += 8;
-    dst += 8;
-  }
-}
-
-static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz,
-                                      int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  filt = LD_UH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz,
-                                      int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
-  v16i8 res0, res1, res2, res3;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
-  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  filt = LD_UH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-  src8 = LD_SB(src);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
-  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
-  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
-  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
-             hz_out3, hz_out5, 8);
-  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
-
-  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
-              vec5, vec6, vec7);
-  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert);
-  } else if (8 == height) {
-    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert);
-  }
-}
-
-static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz,
-                                      int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
-
-  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
-
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-  ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          int8_t *filter_horiz,
-                                          int8_t *filter_vert, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
-  v16u8 filt_hz, filt_vt, vec0;
-  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_SB(src);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LD_SB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    LD_SB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
-    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
-    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz, int8_t *filter_vert,
-                                     int32_t height) {
-  if (4 == height) {
-    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert);
-  } else {
-    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
-                                  filter_horiz, filter_vert, height);
-  }
-}
-
-static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
-  v8i16 filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_SH(filter_horiz);
-  filt_hz = (v16u8)__msa_splati_h(filt, 0);
-
-  filt = LD_SH(filter_vert);
-  filt_vt = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
-    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    PCKEV_ST_SB(tmp1, tmp2, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 2; multiple8_cnt--;) {
-    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter_horiz, int8_t *filter_vert,
-                                      int32_t height) {
-  int32_t multiple8_cnt;
-  for (multiple8_cnt = 4; multiple8_cnt--;) {
-    common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
-                              filter_vert, height);
-    src += 16;
-    dst += 16;
-  }
-}
-
-void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int32_t x_step_q4, const int16_t *filter_y,
-                       int32_t y_step_q4, int32_t w, int32_t h) {
-  int8_t cnt, filt_hor[8], filt_ver[8];
-
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0 &&
-      ((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, &filt_hor[3],
-                                 &filt_ver[3], (int32_t)h);
-        break;
-      case 8:
-        common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, &filt_hor[3],
-                                 &filt_ver[3], (int32_t)h);
-        break;
-      case 16:
-        common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, &filt_hor[3],
-                                  &filt_ver[3], (int32_t)h);
-        break;
-      case 32:
-        common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, &filt_hor[3],
-                                  &filt_ver[3], (int32_t)h);
-        break;
-      case 64:
-        common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, &filt_hor[3],
-                                  &filt_ver[3], (int32_t)h);
-        break;
-      default:
-        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else if (((const int32_t *)filter_x)[0] == 0 ||
-             ((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                    filter_y, y_step_q4, w, h);
-  } else {
-    switch (w) {
-      case 4:
-        common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filt_hor, filt_ver,
-                                 (int32_t)h);
-        break;
-      case 8:
-        common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filt_hor, filt_ver,
-                                 (int32_t)h);
-        break;
-      case 16:
-        common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, filt_hor, filt_ver,
-                                  (int32_t)h);
-        break;
-      case 32:
-        common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, filt_hor, filt_ver,
-                                  (int32_t)h);
-        break;
-      case 64:
-        common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst,
-                                  (int32_t)dst_stride, filt_hor, filt_ver,
-                                  (int32_t)h);
-        break;
-      default:
-        aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
index f7bdfc2bd..aa962b41f 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
@@ -10,7 +10,9 @@
  */
 
 #include <assert.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/aom_convolve_msa.h"
 
 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
deleted file mode 100644
index 75f8c7ea8..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
-                           int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint32_t out0, out1, out2, out3;
-  v16u8 src0, src1, src2, src3;
-  v16u8 dst0, dst1, dst2, dst3;
-
-  if (0 == (height % 4)) {
-    for (cnt = (height / 4); cnt--;) {
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      src += (4 * src_stride);
-
-      LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                  dst2, dst3);
-
-      out0 = __msa_copy_u_w((v4i32)dst0, 0);
-      out1 = __msa_copy_u_w((v4i32)dst1, 0);
-      out2 = __msa_copy_u_w((v4i32)dst2, 0);
-      out3 = __msa_copy_u_w((v4i32)dst3, 0);
-      SW4(out0, out1, out2, out3, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == (height % 2)) {
-    for (cnt = (height / 2); cnt--;) {
-      LD_UB2(src, src_stride, src0, src1);
-      src += (2 * src_stride);
-
-      LD_UB2(dst, dst_stride, dst0, dst1);
-
-      AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
-
-      out0 = __msa_copy_u_w((v4i32)dst0, 0);
-      out1 = __msa_copy_u_w((v4i32)dst1, 0);
-      SW(out0, dst);
-      dst += dst_stride;
-      SW(out1, dst);
-      dst += dst_stride;
-    }
-  }
-}
-
-static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
-                           int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint64_t out0, out1, out2, out3;
-  v16u8 src0, src1, src2, src3;
-  v16u8 dst0, dst1, dst2, dst3;
-
-  for (cnt = (height / 4); cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-
-    out0 = __msa_copy_u_d((v2i64)dst0, 0);
-    out1 = __msa_copy_u_d((v2i64)dst1, 0);
-    out2 = __msa_copy_u_d((v2i64)dst2, 0);
-    out3 = __msa_copy_u_d((v2i64)dst3, 0);
-    SD4(out0, out1, out2, out3, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-
-  for (cnt = (height / 8); cnt--;) {
-    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-    src += (8 * src_stride);
-    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
-                dst6, dst7);
-    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
-    dst += (8 * dst_stride);
-  }
-}
-
-static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint8_t *dst_dup = dst;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
-  for (cnt = (height / 8); cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
-    LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
-    dst_dup += (4 * dst_stride);
-    LD_UB4(src, src_stride, src8, src10, src12, src14);
-    LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
-    src += (4 * src_stride);
-    LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
-    LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
-    dst_dup += (4 * dst_stride);
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
-                dst6, dst7);
-    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
-                dst10, dst11);
-    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
-                dst13, dst14, dst15);
-
-    ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
-    ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
-    dst += (4 * dst_stride);
-    ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
-    ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint8_t *dst_dup = dst;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-
-  for (cnt = (height / 4); cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(src, 16, src4, src5, src6, src7);
-    src += src_stride;
-    LD_UB4(src, 16, src8, src9, src10, src11);
-    src += src_stride;
-    LD_UB4(src, 16, src12, src13, src14, src15);
-    src += src_stride;
-
-    LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
-    dst_dup += dst_stride;
-    LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
-    dst_dup += dst_stride;
-    LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
-    dst_dup += dst_stride;
-    LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
-    dst_dup += dst_stride;
-
-    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
-                dst2, dst3);
-    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
-                dst6, dst7);
-    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9,
-                dst10, dst11);
-    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12,
-                dst13, dst14, dst15);
-
-    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
-    dst += dst_stride;
-    ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
-    dst += dst_stride;
-    ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
-    dst += dst_stride;
-    ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int32_t filter_x_stride,
-                          const int16_t *filter_y, int32_t filter_y_stride,
-                          int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-
-  switch (w) {
-    case 4: {
-      avg_width4_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 8: {
-      avg_width8_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 16: {
-      avg_width16_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 32: {
-      avg_width32_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 64: {
-      avg_width64_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    default: {
-      int32_t lp, cnt;
-      for (cnt = h; cnt--;) {
-        for (lp = 0; lp < w; ++lp) {
-          dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
-        }
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
index 1a0ae4d8d..a0627c074 100644
--- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -31,23 +31,6 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
     tmp_dpadd_0;                                                           \
   })
 
-#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0,       \
-                        filt_h1, filt_h2, filt_h3)                             \
-  ({                                                                           \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                      \
-    v8i16 hz_out_m;                                                            \
-                                                                               \
-    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \
-               vec3_m);                                                        \
-    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0,    \
-                                   filt_h1, filt_h2, filt_h3);                 \
-                                                                               \
-    hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                           \
-    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                     \
-                                                                               \
-    hz_out_m;                                                                  \
-  })
-
 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
                                    mask2, mask3, filt0, filt1, filt2, filt3, \
                                    out0, out1)                               \
@@ -93,32 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
                 res7_m, out0, out1, out2, out3);                             \
   }
 
-#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
-  {                                                  \
-    v16u8 tmp_m;                                     \
-                                                     \
-    tmp_m = PCKEV_XORI128_UB(in1, in0);              \
-    tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);       \
-    ST_UB(tmp_m, (pdst));                            \
-  }
-
-#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)              \
-  {                                                       \
-    v16u8 tmp_m;                                          \
-                                                          \
-    tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
-    tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);            \
-    ST_UB(tmp_m, (pdst));                                 \
-  }
-
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
-                           stride)                                           \
-  {                                                                          \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
-                                                                             \
-    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                         \
-    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
-    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);             \
-    ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                  \
-  }
 #endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
index 31159fdcd..d51bfa899 100644
--- a/third_party/aom/aom_dsp/mips/common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.h
@@ -13,7 +13,9 @@
 #define AOM_COMMON_MIPS_DSPR2_H_
 
 #include <assert.h>
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #ifdef __cplusplus
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
deleted file mode 100644
index d557115b9..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                         uint8_t *dst, int32_t dst_stride,
-                                         const int16_t *filter_y, int32_t w,
-                                         int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2;
-  uint32_t p1, p2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t Temp1, Temp2;
-  const int16_t *filter = &filter_y[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < w; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
-
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
-            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
-            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          const int16_t *filter_y, int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2;
-  uint32_t p1, p2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t Temp1, Temp2;
-  const int16_t *filter = &filter_y[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-    prefetch_store(dst + dst_stride + 32);
-
-    for (x = 0; x < 64; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
-
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
-            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
-            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  uint32_t pos = 38;
-
-  assert(y_step_q4 == 16);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-    case 8:
-    case 16:
-    case 32:
-      convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
-                                   w, h);
-      break;
-    case 64:
-      prefetch_store(dst + 32);
-      convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
-                                    h);
-      break;
-    default:
-      aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                               x_step_q4, filter_y, y_step_q4, w, h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
deleted file mode 100644
index efbdcf60f..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3;
-  uint32_t tn1, tn2;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
-        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
-        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
-        "extp             %[Temp4],       $ac2,           31             \n\t"
-
-        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
-        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
-
-        /* clamp */
-        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
-        "lbux             %[p3],          %[Temp4](%[cm])                \n\t" /* odd 2 */
-        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
-
-        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
-        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
-
-        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t" /* average odd 2 */
-        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-          [Temp4] "=&r"(Temp4)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [dst] "r"(dst), [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
-                                          int32_t src_stride, uint8_t *dst,
-                                          int32_t dst_stride,
-                                          const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2, tp3, tp4;
-  uint32_t p1, p2, p3, p4, n1;
-  uint32_t st0, st1;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "ulw              %[tp3],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-        "lbu              %[Temp2],       0(%[dst])                      \n\t"
-        "lbu              %[tp4],         2(%[dst])                      \n\t"
-
-        /* even 2. pixel */
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac1,           31             \n\t"
-
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
-        "sb               %[Temp2],       0(%[dst])                      \n\t"
-        "sb               %[tp4],         2(%[dst])                      \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-
-        "balign           %[tp3],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "lbu              %[Temp2],       4(%[dst])                      \n\t"
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "sb               %[Temp2],       4(%[dst])                      \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tp1],         6(%[dst])                      \n\t"
-
-        /* odd 2. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],       $ac1,           31             \n\t"
-
-        "lbu              %[tp2],         1(%[dst])                      \n\t"
-        "lbu              %[tp3],         3(%[dst])                      \n\t"
-        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
-        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tp4],         5(%[dst])                      \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp2],         1(%[dst])                      \n\t"
-        "sb               %[tp1],         6(%[dst])                      \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac2,           31             \n\t"
-
-        "lbu              %[tp1],         7(%[dst])                      \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
-        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
-
-        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
-        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
-
-        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
-        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
-
-        /* store bytes */
-        "sb               %[tp3],         3(%[dst])                      \n\t"
-        "sb               %[tp4],         5(%[dst])                      \n\t"
-        "sb               %[tp1],         7(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
-          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [dst] "r"(dst), [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
-                                           int32_t src_stride, uint8_t *dst_ptr,
-                                           int32_t dst_stride,
-                                           const int16_t *filter_x0, int32_t h,
-                                           int32_t count) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_store(dst_ptr + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [dst] "r"(dst), [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
-                                           int32_t src_stride, uint8_t *dst_ptr,
-                                           int32_t dst_stride,
-                                           const int16_t *filter_x0,
-                                           int32_t h) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-    prefetch_store(dst_ptr + dst_stride);
-    prefetch_store(dst_ptr + dst_stride + 32);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [dst] "r"(dst), [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  uint32_t pos = 38;
-
-  assert(x_step_q4 == 16);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h);
-      break;
-    case 8:
-      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h);
-      break;
-    case 16:
-      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                     h, 1);
-      break;
-    case 32:
-      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                     h, 2);
-      break;
-    case 64:
-      prefetch_load(src + 64);
-      prefetch_store(dst + 32);
-
-      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                     h);
-      break;
-    default:
-      aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                x_step_q4, filter_y, y_step_q4, w, h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
index 066308315..08bf1ab30 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
index dc51ab1cb..2a8f75938 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
index 3367be01a..ac87936da 100644
--- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
deleted file mode 100644
index 3574da19f..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c
+++ /dev/null
@@ -1,646 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      const int16_t *filter_y, int32_t w,
-                                      int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2, load3, load4;
-  uint32_t p1, p2;
-  uint32_t n1, n2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2;
-
-  vector1b = ((const int32_t *)filter_y)[0];
-  vector2b = ((const int32_t *)filter_y)[1];
-  vector3b = ((const int32_t *)filter_y)[2];
-  vector4b = ((const int32_t *)filter_y)[3];
-
-  src -= 3 * src_stride;
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < w; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
-
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
-            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       const int16_t *filter_y, int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2, load3, load4;
-  uint32_t p1, p2;
-  uint32_t n1, n2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2;
-
-  vector1b = ((const int32_t *)filter_y)[0];
-  vector2b = ((const int32_t *)filter_y)[1];
-  vector3b = ((const int32_t *)filter_y)[2];
-  vector4b = ((const int32_t *)filter_y)[3];
-
-  src -= 3 * src_stride;
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-    prefetch_store(dst + dst_stride + 32);
-
-    for (x = 0; x < 64; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
-
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
-          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
-            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  if (((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    uint32_t pos = 38;
-
-    /* bit positon for extract from acc */
-    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                         :
-                         : [pos] "r"(pos));
-
-    prefetch_store(dst);
-
-    switch (w) {
-      case 4:
-      case 8:
-      case 16:
-      case 32:
-        convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
-                                  h);
-        break;
-      case 64:
-        prefetch_store(dst + 32);
-        convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
-                                   h);
-        break;
-      default:
-        aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
-
-void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
-  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
-
-  assert(w <= 64);
-  assert(h <= 64);
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-
-  if (intermediate_height < h) intermediate_height = h;
-
-  aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
-                      x_step_q4, filter_y, y_step_q4, w, intermediate_height);
-
-  aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                         x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
-                            int h) {
-  int x, y;
-  uint32_t tp1, tp2, tn1;
-  uint32_t tp3, tp4, tn2;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-      /* 1 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-
-            : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 8:
-      /* 2 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 16:
-      /* 4 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         8(%[src])      \n\t"
-            "ulw              %[tp2],         8(%[dst])      \n\t"
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-            "ulw              %[tp3],         12(%[src])     \n\t"
-            "ulw              %[tp4],         12(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 32:
-      /* 8 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         8(%[src])      \n\t"
-            "ulw              %[tp2],         8(%[dst])      \n\t"
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-            "ulw              %[tp3],         12(%[src])     \n\t"
-            "ulw              %[tp4],         12(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         16(%[src])     \n\t"
-            "ulw              %[tp2],         16(%[dst])     \n\t"
-            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         20(%[src])     \n\t"
-            "ulw              %[tp4],         20(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         24(%[src])     \n\t"
-            "ulw              %[tp2],         24(%[dst])     \n\t"
-            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         28(%[src])     \n\t"
-            "ulw              %[tp4],         28(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    case 64:
-      prefetch_load(src + 64);
-      prefetch_store(dst + 32);
-
-      /* 16 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_load(src + src_stride + 64);
-        prefetch_store(dst + dst_stride);
-        prefetch_store(dst + dst_stride + 32);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         0(%[dst])      \n\t"
-            "ulw              %[tp3],         4(%[src])      \n\t"
-            "ulw              %[tp4],         4(%[dst])      \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         8(%[src])      \n\t"
-            "ulw              %[tp2],         8(%[dst])      \n\t"
-            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
-            "ulw              %[tp3],         12(%[src])     \n\t"
-            "ulw              %[tp4],         12(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         16(%[src])     \n\t"
-            "ulw              %[tp2],         16(%[dst])     \n\t"
-            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         20(%[src])     \n\t"
-            "ulw              %[tp4],         20(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         24(%[src])     \n\t"
-            "ulw              %[tp2],         24(%[dst])     \n\t"
-            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         28(%[src])     \n\t"
-            "ulw              %[tp4],         28(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         32(%[src])     \n\t"
-            "ulw              %[tp2],         32(%[dst])     \n\t"
-            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         36(%[src])     \n\t"
-            "ulw              %[tp4],         36(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         40(%[src])     \n\t"
-            "ulw              %[tp2],         40(%[dst])     \n\t"
-            "sw               %[tn1],         32(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         36(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         44(%[src])     \n\t"
-            "ulw              %[tp4],         44(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         48(%[src])     \n\t"
-            "ulw              %[tp2],         48(%[dst])     \n\t"
-            "sw               %[tn1],         40(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         44(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         52(%[src])     \n\t"
-            "ulw              %[tp4],         52(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "ulw              %[tp1],         56(%[src])     \n\t"
-            "ulw              %[tp2],         56(%[dst])     \n\t"
-            "sw               %[tn1],         48(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         52(%[dst])     \n\t" /* store */
-            "ulw              %[tp3],         60(%[src])     \n\t"
-            "ulw              %[tp4],         60(%[dst])     \n\t"
-            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
-            "sw               %[tn1],         56(%[dst])     \n\t" /* store */
-            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
-            "sw               %[tn2],         60(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    default:
-      for (y = h; y > 0; --y) {
-        for (x = 0; x < w; ++x) {
-          dst[x] = (dst[x] + src[x] + 1) >> 1;
-        }
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
deleted file mode 100644
index f6534b420..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ /dev/null
@@ -1,998 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4;
-  uint32_t n1, n2, n3, n4;
-  uint32_t tn1, tn2;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "ulw              %[tn2],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
-        "balign           %[tn1],         %[tn2],         3              \n\t"
-        "balign           %[tn2],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
-        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
-        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
-        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
-        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp4],       $ac2,           31             \n\t"
-
-        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
-        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
-
-        /* clamp */
-        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
-        "lbux             %[n2],          %[Temp4](%[cm])                \n\t" /* odd 2 */
-        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
-
-        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
-        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
-
-        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t" /* average odd 2 */
-        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
-          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
-          [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
-                                       uint8_t *dst, int32_t dst_stride,
-                                       const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4, n1;
-  uint32_t tn1, tn2, tn3;
-  uint32_t st0, st1;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "ulw              %[tn2],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-        "lbu              %[Temp2],       0(%[dst])                      \n\t"
-        "lbu              %[tn3],         2(%[dst])                      \n\t"
-
-        /* even 2. pixel */
-        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
-        "ulw              %[tn1],         12(%[src])                     \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
-        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac1,           31             \n\t"
-
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-        "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
-        "sb               %[Temp2],       0(%[dst])                      \n\t"
-        "sb               %[tn3],         2(%[dst])                      \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-
-        "balign           %[tn3],         %[tn1],         3              \n\t"
-        "balign           %[tn1],         %[tn2],         3              \n\t"
-        "balign           %[tn2],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-
-        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
-        "lbu              %[Temp2],       4(%[dst])                      \n\t"
-        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                           \n\t"
-        "mthi             $zero,          $ac1                           \n\t"
-        "sb               %[Temp2],       4(%[dst])                      \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tp1],         6(%[dst])                      \n\t"
-
-        /* odd 2. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
-        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
-        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac1,           31             \n\t"
-
-        "lbu              %[tp2],         1(%[dst])                      \n\t"
-        "lbu              %[tn2],         3(%[dst])                      \n\t"
-        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
-        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
-        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        "lbu              %[tn3],         5(%[dst])                      \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp2],         1(%[dst])                      \n\t"
-        "sb               %[tp1],         6(%[dst])                      \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac2,           31             \n\t"
-
-        "lbu              %[tn1],         7(%[dst])                      \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
-        "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
-
-        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
-        "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
-
-        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
-        "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
-
-        /* store bytes */
-        "sb               %[tn2],         3(%[dst])                      \n\t"
-        "sb               %[tn3],         5(%[dst])                      \n\t"
-        "sb               %[tn1],         7(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
-          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
-          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
-          [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
-                                        int32_t src_stride, uint8_t *dst_ptr,
-                                        int32_t dst_stride,
-                                        const int16_t *filter_x0, int32_t h,
-                                        int32_t count) { /*
-   * Horizontal filter with averaging, processed in groups of 16 pixels.
-   *
-   * For each of the h rows and each of the count 16-pixel groups in a row,
-   * the asm block below forms 16 sums of products from the source bytes and
-   * the four packed filter words, looks each extracted sum up in cm, and
-   * writes back the rounded average (addqh_r.w) of that table value and the
-   * byte already present in dst.  Even outputs (dst[0], dst[2], ... dst[14])
-   * are produced first from loads at src + 0, 4, ... 20; odd outputs
-   * (dst[1], dst[3], ... dst[15]) follow from loads at src + 1, 5, ... 21.
-   * Work for three pixels is interleaved across $ac1, $ac2 and $ac3, so the
-   * trailing "even N"/"odd N" labels name the pixel an instruction belongs
-   * to, which is often not the pixel named by the nearest section header.
-   *
-   * src_ptr/src_stride: first source byte of the first row and the step, in
-   *     bytes, between rows.  Each group reads src[0] .. src[24].
-   * dst_ptr/dst_stride: destination row start and row step; dst is read and
-   *     then overwritten.
-   * filter_x0: read as four 32-bit words (filter12 .. filter78).
-   * h:      number of rows.
-   * count:  number of 16-pixel groups per row.
-   *
-   * NOTE(review): extp depends on DSP control state that this function does
-   * not set - presumably the caller configures it; confirm.
-   */
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl; /* table indexed by each extracted sum (lbux) */
-  uint32_t vector_64 = 64; /* value every accumulator is seeded with (mtlo) */
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-
-  /* Each 32-bit read spans two adjacent int16_t entries of filter_x0.
-   * NOTE(review): assumes filter_x0 is suitably aligned for int32_t access
-   * - confirm against callers. */
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) { /* runs h times, one row per pass */
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch the next row of source and destination into cache */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_store(dst_ptr + dst_stride);
-
-    for (c = 0; c < count; c++) { /* 16 output pixels per pass */
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "ulw              %[qload2],    16(%[src])                   \n\t"
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "ulw              %[qload3],    20(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "ulw              %[qload2],    17(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "ulw              %[qload3],    21(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
-            [src] "r"(src)); /* NOTE(review): no clobber list, yet the asm writes $ac1-$ac3 and stores through dst - confirm */
-
-      src += 16; /* advance to the next 16-pixel group in this row */
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
-                                        int32_t src_stride, uint8_t *dst_ptr,
-                                        int32_t dst_stride,
-                                        const int16_t *filter_x0, int32_t h) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-    prefetch_store(dst_ptr + dst_stride);
-    prefetch_store(dst_ptr + dst_stride + 32);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
-          "ulw              %[qload2],    16(%[src])                   \n\t"
-          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
-          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
-          "ulw              %[qload3],    20(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
-          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
-          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
-          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                   \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
-          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
-          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
-          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
-          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
-          "ulw              %[qload2],    17(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
-          "ulw              %[qload3],    21(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
-          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
-
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
-
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
-
-          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
-          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
-          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
-            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
-            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-            [Temp3] "=&r"(Temp3)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
-            [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  assert(x_step_q4 == 16); /* only this step value is handled here */
-  assert(((const int32_t *)filter_x)[1] != 0x800000); /* packed taps 2, 3 */
-
-  if (((const int32_t *)filter_x)[0] == 0) { /* taps 0 and 1 both zero */
-    aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    uint32_t pos = 38;
-
-    src -= 3; /* undone via src + 3 in the default (C fallback) case */
-
-    /* bit position for extract from acc */
-    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                         :
-                         : [pos] "r"(pos));
-
-    /* prefetch data to cache memory */
-    prefetch_load(src);
-    prefetch_load(src + 32);
-    prefetch_store(dst);
-
-    switch (w) { /* dispatch on block width */
-      case 4:
-        convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                   h);
-        break;
-      case 8:
-        convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                   h);
-        break;
-      case 16: /* NOTE(review): last arg presumably 16-pixel groups per row */
-        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h, 1);
-        break;
-      case 32: /* same helper as width 16, called with 2 instead of 1 */
-        convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h, 2);
-        break;
-      case 64:
-        prefetch_load(src + 64);
-        prefetch_store(dst + 32);
-
-        convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                    h);
-        break;
-      default: /* any other width: C implementation with the original src */
-        aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                  h);
-        break;
-    }
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
index dd4bc821a..af54b4264 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
@@ -12,1389 +12,14 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
 
 #if HAVE_DSPR2
-static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              const int16_t *filter_x0,
-                                              int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl; /* table indexed by the lbux clamps below */
-  uint8_t *dst_ptr;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64; /* preloaded into each accumulator's LO via mtlo */
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4;
-  uint32_t tn1, tn2;
-
-  vector1b = ((const int32_t *)filter_x0)[0]; /* packed taps 0 and 1 */
-  vector2b = ((const int32_t *)filter_x0)[1]; /* packed taps 2 and 3 */
-  vector3b = ((const int32_t *)filter_x0)[2]; /* packed taps 4 and 5 */
-  vector4b = ((const int32_t *)filter_x0)[3]; /* packed taps 6 and 7 */
-
-  for (y = h; y--;) { /* one source row per iteration, h rows in total */
-    dst_ptr = dst;
-    /* prefetch the next source row into cache */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "ulw              %[tn2],         8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
-        "balign           %[tn1],         %[tn2],         3              \n\t"
-        "balign           %[tn2],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],       $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp4],       $ac2,           31             \n\t"
-
-        /* clamp: byte lookups in cm indexed by the extracted sums */
-        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
-        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
-
-        /* store bytes (even 1, odd 1, even 2, odd 2), one per dst row */
-        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
-          [dst_stride] "r"(dst_stride));
-
-    /* Next source row; its outputs start one byte further along dst */
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
-                                              int32_t src_stride, uint8_t *dst,
-                                              int32_t dst_stride,
-                                              const int16_t *filter_x0,
-                                              int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint8_t *dst_ptr;
-  uint32_t vector4a = 64;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2, tp3;
-  uint32_t p1, p2, p3, p4, n1;
-  uint8_t *odd_dst;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-
-    dst_ptr = dst;
-    odd_dst = (dst_ptr + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp2],         0(%[src])                       \n\t"
-        "ulw              %[tp1],         4(%[src])                       \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
-        "ulw              %[tp3],         8(%[src])                       \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
-        "extp             %[Temp1],       $ac3,           31              \n\t"
-
-        /* even 2. pixel */
-        "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
-        "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
-        "ulw              %[tp2],         12(%[src])                      \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
-        "extp             %[Temp3],       $ac2,           31              \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
-        "mtlo             %[vector4a],    $ac1                            \n\t"
-        "mthi             $zero,          $ac1                            \n\t"
-        "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
-        "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
-        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
-        "extp             %[p3],          $ac1,           31              \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-
-        "ulw              %[tp1],         1(%[src])                       \n\t"
-        "ulw              %[tp3],         5(%[src])                       \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
-        "extp             %[Temp3],       $ac2,           31              \n\t"
-
-        "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                            \n\t"
-        "mthi             $zero,          $ac1                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
-        "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "ulw              %[tp2],         9(%[src])                       \n\t"
-
-        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
-        "extp             %[Temp2],       $ac3,           31              \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
-        "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
-        "ulw              %[Temp1],       13(%[src])                      \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
-        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
-        "extp             %[Temp3],       $ac1,           31              \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
-        "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
-        "extp             %[Temp2],       $ac3,           31              \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
-        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
-        "extp             %[Temp1],       $ac2,           31              \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
-        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
-        "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
-
-        /* store bytes */
-        "sb               %[p4],          0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-
-        "sb               %[p2],          0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-
-        "sb               %[n1],          0(%[odd_dst])                   \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
-          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-          [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
-          [dst_pitch_2] "r"(dst_pitch_2));
-
-    /* Next row... */
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-static void convolve_horiz_16_transposed_dspr2(
-    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
-    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { /*
-   * Horizontal filter pass that writes its output transposed. For each of
-   * the h source rows, count groups of 16 output bytes are produced; every
-   * output accumulates four dpa.w.ph dual-halfword products (eight taps)
-   * taken from filter_x0 and is passed through the cm lookup table before
-   * being stored. Consecutive outputs of one source row are stored
-   * dst_stride bytes apart, and each new source row starts one byte further
-   * along dst_ptr.
-   *
-   * src_ptr    - first source byte of the first row; each group of 16
-   *              outputs loads source bytes 0..24 relative to its start.
-   * src_stride - byte distance between source rows.
-   * dst_ptr    - destination of the first output.
-   * dst_stride - byte distance between consecutive outputs of one row.
-   * filter_x0  - eight int16 taps, read here as four 32-bit words
-   *              (NOTE(review): assumes 4-byte alignment; confirm).
-   * h          - number of source rows.
-   * count      - number of 16-output groups per row.
-   *
-   * NOTE(review): extp takes its extract position from DSPControl, which is
-   * not written in this function; presumably set up by the caller. Confirm.
-   */
-  int32_t c, y;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl; /* lbux base: table indexed by each filter sum */
-  uint32_t vector_64 = 64; /* preloaded into every accumulator before the taps */
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
-  uint8_t *odd_dst; /* write cursor for the odd-numbered outputs */
-
-  filter12 = ((const int32_t *)filter_x0)[0]; /* filter_x0[0], filter_x0[1] */
-  filter34 = ((const int32_t *)filter_x0)[1]; /* filter_x0[2], filter_x0[3] */
-  filter56 = ((const int32_t *)filter_x0)[2]; /* filter_x0[4], filter_x0[5] */
-  filter78 = ((const int32_t *)filter_x0)[3]; /* filter_x0[6], filter_x0[7] */
-
-  for (y = h; y--;) {
-    /* prefetch the start of the next source row (two addresses 32 bytes apart) */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-
-    src = src_ptr;
-    dst = dst_ptr;
-
-    odd_dst = (dst + dst_stride); /* odd outputs start one dst_stride after dst */
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],        0(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        4(%[src])                       "
-          "\n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 1 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 2 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "ulw              %[qload2],        8(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 3 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "ulw              %[qload1],        12(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
-          "\n\t" /* even 2 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
-          "\n\t" /* even 2 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
-          "\n\t" /* even 2 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
-          "\n\t" /* even 2 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 1 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 2 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 4 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 1 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
-          "          \n\t"
-          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
-          "\n\t" /* even 3 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 3 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 2 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 5 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 2 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        16(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
-          "\n\t" /* even 4 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 4 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 6 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 3 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
-          "\n\t" /* even 5 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 5 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 7 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 4 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        20(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* even 6 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 6 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 8 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 5 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* even 7 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 7 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 1 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* even 8 */
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 6 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* even 8 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 8 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],        1(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        5(%[src])                       "
-          "\n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 2 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 7 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        9(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 1 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 1 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 3 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 8 */
-          "ulw              %[qload1],        13(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 2 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 2 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 4 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 1 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 3 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 3 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 5 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 2 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        17(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
-          "\n\t" /* odd 4 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 4 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 6 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 3 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
-          "\n\t" /* odd 5 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 5 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 7 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 4 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        21(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 6 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 6 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 8 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 5 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 7 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 8 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 8 */
-
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 6 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 7 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 8 */
-
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 6 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 7 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
-            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
-            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
-            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
-            [dst_pitch_2] "r"(dst_pitch_2));
-
-      src += 16; /* next 16 source pixels of this row */
-      dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); /* recomputed from dst_ptr, not from the asm-advanced cursor */
-      odd_dst = (dst + dst_stride);
-    }
-
-    /* Next source row; its outputs start one byte further along dst_ptr. */
-    src_ptr += src_stride;
-
-    dst_ptr += 1;
-  }
-}
-
-static void convolve_horiz_64_transposed_dspr2(
-    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
-    int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
-  int32_t c, y;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-  uint8_t *odd_dst;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-
-    src = src_ptr;
-    dst = dst_ptr;
-
-    odd_dst = (dst + dst_stride);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],        0(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        4(%[src])                       "
-          "\n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 1 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 2 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "ulw              %[qload2],        8(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 3 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "ulw              %[qload1],        12(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
-          "\n\t" /* even 1 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
-          "\n\t" /* even 1 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 1 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 4 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 1 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
-          "          \n\t"
-          "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
-          "\n\t" /* even 3 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
-          "\n\t" /* even 3 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 3 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 5 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 2 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        16(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
-          "\n\t" /* even 4 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
-          "\n\t" /* even 4 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 4 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 6 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 3 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
-          "\n\t" /* even 5 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
-          "\n\t" /* even 5 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 5 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 7 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 4 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        20(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* even 6 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* even 6 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 6 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 8 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 5 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* even 7 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* even 7 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 7 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 1 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* even 8 */
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 6 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* even 8 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* even 8 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 8 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],        1(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        5(%[src])                       "
-          "\n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 2 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 7 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        9(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 1 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 1 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 3 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 8 */
-          "ulw              %[qload1],        13(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 2 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 2 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 4 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 1 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 3 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 3 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 5 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 2 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        17(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
-          "\n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
-          "\n\t" /* odd 4 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 4 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 6 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 3 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
-          "\n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
-          "\n\t" /* odd 5 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 5 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 7 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p1],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 4 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        21(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
-          "\n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
-          "\n\t" /* odd 6 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 6 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 8 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 5 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
-          "\n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
-          "\n\t" /* odd 7 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
-          "\n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
-          "\n\t" /* odd 8 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 8 */
-
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 6 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 7 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 8 */
-
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 6 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 7 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
-            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
-            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
-            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
-            [dst_pitch_2] "r"(dst_pitch_2));
-
-      src += 16;
-      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
-      odd_dst = (dst + dst_stride);
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-
-    dst_ptr += 1;
-  }
-}
-
-void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter, int w, int h) {
-  int x, y, k;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      int sum = 0;
-
-      for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
-
-      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-    }
-
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x * dst_stride] = src[x];
-    }
-
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
-  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
-  uint32_t pos = 38;
-
-  (void)x_step_q4;
-
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  if (intermediate_height < h) intermediate_height = h;
-
-  /* copy the src to dst */
-  if (filter_x[3] == 0x80) {
-    copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
-                          intermediate_height, w, intermediate_height);
-  } else if (((const int32_t *)filter_x)[0] == 0) {
-    aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
-                        intermediate_height, filter_x, w, intermediate_height);
-  } else {
-    src -= (src_stride * 3 + 3);
-
-    /* prefetch data to cache memory */
-    prefetch_load(src);
-    prefetch_load(src + 32);
-
-    switch (w) {
-      case 4:
-        convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
-                                          intermediate_height, filter_x,
-                                          intermediate_height);
-        break;
-      case 8:
-        convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
-                                          intermediate_height, filter_x,
-                                          intermediate_height);
-        break;
-      case 16:
-      case 32:
-        convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
-                                           intermediate_height, filter_x,
-                                           intermediate_height, (w / 16));
-        break;
-      case 64:
-        prefetch_load(src + 32);
-        convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
-                                           intermediate_height, filter_x,
-                                           intermediate_height);
-        break;
-      default:
-        convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
-                                  filter_x, w, intermediate_height);
-        break;
-    }
-  }
-
-  /* copy the src to dst */
-  if (filter_y[3] == 0x80) {
-    copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
-  } else if (((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
-                        filter_y, h, w);
-  } else {
-    switch (h) {
-      case 4:
-        convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
-                                          dst_stride, filter_y, w);
-        break;
-      case 8:
-        convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
-                                          dst_stride, filter_y, w);
-        break;
-      case 16:
-      case 32:
-        convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
-                                           dst_stride, filter_y, w, (h / 16));
-        break;
-      case 64:
-        convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
-                                           dst_stride, filter_y, w);
-        break;
-      default:
-        convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
-                                  filter_y, h, w);
-        break;
-    }
-  }
-}
-
 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int filter_x_stride,
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
index c60557617..f9c6879ab 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
index d8a90b6ab..201e66427 100644
--- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <stdio.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/convolve_common_dspr2.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
index f8fd9e2b6..e7b8d531b 100644
--- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 
@@ -29,18 +30,6 @@ void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h);
 
-void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h);
-
-void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h);
-
 void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter, int w,
                          int h);
diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
deleted file mode 100644
index 43dce8ba6..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c
+++ /dev/null
@@ -1,928 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-
-static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
-                                              int32_t src_stride,
-                                              int16_t *temp_buff) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 step0, step1, step2, step3;
-  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
-  v8i16 step0_1, step1_1, step2_1, step3_1;
-
-  /* 1st and 2nd set */
-  LD_SH4(input, src_stride, in0, in1, in2, in3);
-  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
-  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
-  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
-  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
-  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
-              step3, in4, in5, in6, in7);
-  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
-              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
-  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
-  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
-  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
-  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
-
-  /* 3rd and 4th set */
-  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
-  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
-  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
-  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
-  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
-  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
-              step3, in4, in5, in6, in7);
-  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1,
-              step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
-  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
-  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
-  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
-  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
-}
-
-static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8i16 temp0, temp1;
-
-  /* fdct even */
-  LD_SH4(input, 8, in0, in1, in2, in3);
-  LD_SH4(input + 96, 8, in12, in13, in14, in15);
-  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2,
-              vec3, in12, in13, in14, in15);
-  LD_SH4(input + 32, 8, in4, in5, in6, in7);
-  LD_SH4(input + 64, 8, in8, in9, in10, in11);
-  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7,
-              in8, in9, in10, in11);
-
-  /* Stage 3 */
-  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
-  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp);
-  ST_SH(temp1, temp + 512);
-
-  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 256);
-  ST_SH(temp1, temp + 768);
-
-  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 128);
-  ST_SH(temp1, temp + 896);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 640);
-  ST_SH(temp1, temp + 384);
-
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 64);
-  ST_SH(temp1, temp + 960);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 576);
-  ST_SH(temp1, temp + 448);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 320);
-  ST_SH(temp1, temp + 704);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
-  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
-  ST_SH(temp0, temp + 192);
-  ST_SH(temp1, temp + 832);
-}
-
-static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
-  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
-  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
-
-  in20 = LD_SH(input + 32);
-  in21 = LD_SH(input + 40);
-  in26 = LD_SH(input + 80);
-  in27 = LD_SH(input + 88);
-
-  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
-  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
-  in18 = LD_SH(input + 16);
-  in19 = LD_SH(input + 24);
-  in28 = LD_SH(input + 96);
-  in29 = LD_SH(input + 104);
-
-  vec4 = in19 - in20;
-  ST_SH(vec4, input + 32);
-  vec4 = in18 - in21;
-  ST_SH(vec4, input + 40);
-  vec4 = in29 - in26;
-  ST_SH(vec4, input + 80);
-  vec4 = in28 - in27;
-  ST_SH(vec4, input + 88);
-
-  in21 = in18 + in21;
-  in20 = in19 + in20;
-  in27 = in28 + in27;
-  in26 = in29 + in26;
-
-  LD_SH4(input + 48, 8, in22, in23, in24, in25);
-  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
-  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-
-  in16 = LD_SH(input);
-  in17 = LD_SH(input + 8);
-  in30 = LD_SH(input + 112);
-  in31 = LD_SH(input + 120);
-
-  vec4 = in17 - in22;
-  ST_SH(vec4, input + 16);
-  vec4 = in16 - in23;
-  ST_SH(vec4, input + 24);
-  vec4 = in31 - in24;
-  ST_SH(vec4, input + 96);
-  vec4 = in30 - in25;
-  ST_SH(vec4, input + 104);
-
-  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
-  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
-  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
-  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
-  ADD2(in27, in26, in25, in24, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr);
-  ST_SH(vec4, temp_ptr + 960);
-
-  SUB2(in27, in26, in25, in24, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 448);
-  ST_SH(vec4, temp_ptr + 512);
-
-  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
-  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
-  SUB2(in26, in27, in24, in25, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec4, temp_ptr + 704);
-  ST_SH(vec5, temp_ptr + 256);
-
-  ADD2(in26, in27, in24, in25, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec4, temp_ptr + 192);
-  ST_SH(vec5, temp_ptr + 768);
-
-  LD_SH4(input + 16, 8, in22, in23, in20, in21);
-  LD_SH4(input + 80, 8, in26, in27, in24, in25);
-  in16 = in20;
-  in17 = in21;
-  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
-  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
-  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
-  ADD2(in28, in29, in31, in30, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 832);
-  ST_SH(vec4, temp_ptr + 128);
-
-  SUB2(in28, in29, in31, in30, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 320);
-  ST_SH(vec4, temp_ptr + 640);
-  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
-  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
-  SUB2(in29, in28, in30, in31, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 576);
-  ST_SH(vec4, temp_ptr + 384);
-
-  ADD2(in29, in28, in30, in31, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
-  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
-  ST_SH(vec5, temp_ptr + 64);
-  ST_SH(vec4, temp_ptr + 896);
-}
-
-static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
-                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
-  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
-  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
-  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
-}
-
-static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
-                                           int16_t *output) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
-
-  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
-               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
-
-  /* 2nd set */
-  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, step0, step1, step2, step3, step4, step5,
-               step6, step7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
-         (output + 8 * 8), 8);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
-}
-
-static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
-                                    int16_t *out) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
-  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
-  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
-
-  /* fdct32 even */
-  /* stage 2 */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
-               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
-
-  /* Stage 3 */
-  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
-  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
-  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
-  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
-  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
-  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
-  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
-  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
-  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w,
-       tmp1_w, tmp2_w, tmp3_w);
-  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
-  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r,
-       vec1_r, vec2_r, vec3_r);
-
-  tmp3_w = vec0_r + vec3_r;
-  vec0_r = vec0_r - vec3_r;
-  vec3_r = vec1_r + vec2_r;
-  vec1_r = vec1_r - vec2_r;
-
-  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
-                    vec4_r, tmp3_w, vec6_r, vec3_r);
-  FDCT32_POSTPROC_NEG_W(vec4_r);
-  FDCT32_POSTPROC_NEG_W(tmp3_w);
-  FDCT32_POSTPROC_NEG_W(vec6_r);
-  FDCT32_POSTPROC_NEG_W(vec3_r);
-  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
-  ST_SH2(vec5, vec4, out, 8);
-
-  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
-                    vec4_r, tmp3_w, vec6_r, vec3_r);
-  FDCT32_POSTPROC_NEG_W(vec4_r);
-  FDCT32_POSTPROC_NEG_W(tmp3_w);
-  FDCT32_POSTPROC_NEG_W(vec6_r);
-  FDCT32_POSTPROC_NEG_W(vec3_r);
-  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
-  ST_SH2(vec5, vec4, out + 16, 8);
-
-  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
-  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 32);
-  ST_SH(in5, out + 56);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 40);
-  ST_SH(in5, out + 48);
-
-  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 64);
-  ST_SH(in5, out + 120);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 72);
-  ST_SH(in5, out + 112);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 80);
-  ST_SH(in5, out + 104);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
-  FDCT_POSTPROC_2V_NEG_H(in4, in5);
-  ST_SH(in4, out + 96);
-  ST_SH(in5, out + 88);
-}
-
-static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
-
-  /* fdct32 even */
-  /* stage 2 */
-  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
-               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  /* Stage 3 */
-  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
-  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out);
-  ST_SH(temp1, out + 8);
-
-  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 16);
-  ST_SH(temp1, out + 24);
-
-  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 32);
-  ST_SH(temp1, out + 56);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 40);
-  ST_SH(temp1, out + 48);
-
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 64);
-  ST_SH(temp1, out + 120);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 72);
-  ST_SH(temp1, out + 112);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5)
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 80);
-  ST_SH(temp1, out + 104);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
-  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
-  ST_SH(temp0, out + 96);
-  ST_SH(temp1, out + 88);
-}
-
-static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
-                                int16_t *out) {
-  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
-  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
-
-  in20 = LD_SH(temp + 32);
-  in21 = LD_SH(temp + 40);
-  in26 = LD_SH(temp + 80);
-  in27 = LD_SH(temp + 88);
-
-  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
-  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
-  in18 = LD_SH(temp + 16);
-  in19 = LD_SH(temp + 24);
-  in28 = LD_SH(temp + 96);
-  in29 = LD_SH(temp + 104);
-
-  vec4 = in19 - in20;
-  ST_SH(vec4, interm_ptr + 32);
-  vec4 = in18 - in21;
-  ST_SH(vec4, interm_ptr + 88);
-  vec4 = in28 - in27;
-  ST_SH(vec4, interm_ptr + 56);
-  vec4 = in29 - in26;
-  ST_SH(vec4, interm_ptr + 64);
-
-  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
-
-  in22 = LD_SH(temp + 48);
-  in23 = LD_SH(temp + 56);
-  in24 = LD_SH(temp + 64);
-  in25 = LD_SH(temp + 72);
-
-  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
-  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-
-  in16 = LD_SH(temp);
-  in17 = LD_SH(temp + 8);
-  in30 = LD_SH(temp + 112);
-  in31 = LD_SH(temp + 120);
-
-  vec4 = in17 - in22;
-  ST_SH(vec4, interm_ptr + 40);
-  vec4 = in30 - in25;
-  ST_SH(vec4, interm_ptr + 48);
-  vec4 = in31 - in24;
-  ST_SH(vec4, interm_ptr + 72);
-  vec4 = in16 - in23;
-  ST_SH(vec4, interm_ptr + 80);
-
-  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
-  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
-  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-
-  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
-  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
-  ADD2(in27, in26, in25, in24, in23, in20);
-
-  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out);
-  ST_SH(vec4, out + 120);
-
-  SUB2(in27, in26, in25, in24, in22, in21);
-
-  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 112);
-  ST_SH(vec4, out + 8);
-
-  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
-  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
-  SUB2(in26, in27, in24, in25, in23, in20);
-
-  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec4, out + 16);
-  ST_SH(vec5, out + 104);
-
-  ADD2(in26, in27, in24, in25, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec4, out + 24);
-  ST_SH(vec5, out + 96);
-
-  in20 = LD_SH(interm_ptr + 32);
-  in21 = LD_SH(interm_ptr + 88);
-  in27 = LD_SH(interm_ptr + 56);
-  in26 = LD_SH(interm_ptr + 64);
-
-  in16 = in20;
-  in17 = in21;
-  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
-  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-
-  in22 = LD_SH(interm_ptr + 40);
-  in25 = LD_SH(interm_ptr + 48);
-  in24 = LD_SH(interm_ptr + 72);
-  in23 = LD_SH(interm_ptr + 80);
-
-  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
-  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
-  ADD2(in28, in29, in31, in30, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 32);
-  ST_SH(vec4, out + 88);
-
-  SUB2(in28, in29, in31, in30, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 40);
-  ST_SH(vec4, out + 80);
-
-  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
-  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
-  SUB2(in29, in28, in30, in31, in16, in19);
-
-  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec5, out + 72);
-  ST_SH(vec4, out + 48);
-
-  ADD2(in29, in28, in30, in31, in17, in18);
-
-  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
-  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
-  ST_SH(vec4, out + 56);
-  ST_SH(vec5, out + 64);
-}
-
-static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
-
-  /* 1st set */
-  in0 = LD_SH(temp);
-  in4 = LD_SH(temp + 32);
-  in2 = LD_SH(temp + 64);
-  in6 = LD_SH(temp + 96);
-  in1 = LD_SH(temp + 128);
-  in7 = LD_SH(temp + 152);
-  in3 = LD_SH(temp + 192);
-  in5 = LD_SH(temp + 216);
-
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-
-  /* 2nd set */
-  in0_1 = LD_SH(temp + 16);
-  in1_1 = LD_SH(temp + 232);
-  in2_1 = LD_SH(temp + 80);
-  in3_1 = LD_SH(temp + 168);
-  in4_1 = LD_SH(temp + 48);
-  in5_1 = LD_SH(temp + 176);
-  in6_1 = LD_SH(temp + 112);
-  in7_1 = LD_SH(temp + 240);
-
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
-  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
-                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
-
-  /* 3rd set */
-  in0 = LD_SH(temp + 8);
-  in1 = LD_SH(temp + 136);
-  in2 = LD_SH(temp + 72);
-  in3 = LD_SH(temp + 200);
-  in4 = LD_SH(temp + 40);
-  in5 = LD_SH(temp + 208);
-  in6 = LD_SH(temp + 104);
-  in7 = LD_SH(temp + 144);
-
-  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8,
-         32);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
-
-  /* 4th set */
-  in0_1 = LD_SH(temp + 24);
-  in1_1 = LD_SH(temp + 224);
-  in2_1 = LD_SH(temp + 88);
-  in3_1 = LD_SH(temp + 160);
-  in4_1 = LD_SH(temp + 56);
-  in5_1 = LD_SH(temp + 184);
-  in6_1 = LD_SH(temp + 120);
-  in7_1 = LD_SH(temp + 248);
-
-  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
-                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
-  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24,
-         32);
-}
-
-static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
-  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
-  fdct8x32_1d_row_even(temp_buf, temp_buf);
-  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
-  fdct8x32_1d_row_transpose_store(temp_buf, output);
-}
-
-static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
-                               int16_t *output) {
-  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
-  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
-  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
-  fdct8x32_1d_row_transpose_store(tmp_buf, output);
-}
-
-void aom_fdct32x32_msa(const int16_t *input, int16_t *output,
-                       int32_t src_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
-
-  /* column transform */
-  for (i = 0; i < 4; ++i) {
-    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
-                       tmp_buf_big + (8 * i));
-  }
-
-  /* row transform */
-  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
-
-  /* row transform */
-  for (i = 1; i < 4; ++i) {
-    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
-  }
-}
-
-static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
-
-  /* fdct32 even */
-  /* stage 2 */
-  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
-
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6,
-               vec7, in8, in9, in10, in11, in12, in13, in14, in15);
-  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
-  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
-  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
-  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
-  FDCT_POSTPROC_2V_NEG_H(in8, in9);
-  FDCT_POSTPROC_2V_NEG_H(in10, in11);
-  FDCT_POSTPROC_2V_NEG_H(in12, in13);
-  FDCT_POSTPROC_2V_NEG_H(in14, in15);
-
-  /* Stage 3 */
-  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
-
-  temp0 = in0 + in3;
-  in0 = in0 - in3;
-  in3 = in1 + in2;
-  in1 = in1 - in2;
-
-  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
-  ST_SH(temp0, out);
-  ST_SH(temp1, out + 8);
-
-  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
-  ST_SH(temp0, out + 16);
-  ST_SH(temp1, out + 24);
-
-  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
-  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
-  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
-  ST_SH(temp0, out + 32);
-  ST_SH(temp1, out + 56);
-
-  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
-  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
-  ST_SH(temp0, out + 40);
-  ST_SH(temp1, out + 48);
-
-  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
-  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
-  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
-  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
-  ADD2(in0, in1, in2, in3, vec0, vec7);
-  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
-  ST_SH(temp0, out + 64);
-  ST_SH(temp1, out + 120);
-
-  SUB2(in0, in1, in2, in3, in0, in2);
-  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
-  ST_SH(temp0, out + 72);
-  ST_SH(temp1, out + 112);
-
-  SUB2(in9, vec2, in14, vec5, vec2, vec5);
-  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
-  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
-  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
-  ST_SH(temp0, out + 80);
-  ST_SH(temp1, out + 104);
-
-  ADD2(in3, in2, in0, in1, vec3, vec4);
-  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
-  ST_SH(temp0, out + 96);
-  ST_SH(temp1, out + 88);
-}
-
-static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
-                                   int16_t *out) {
-  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
-  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
-  v8i16 vec4, vec5;
-
-  in20 = LD_SH(temp + 32);
-  in21 = LD_SH(temp + 40);
-  in26 = LD_SH(temp + 80);
-  in27 = LD_SH(temp + 88);
-
-  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
-  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
-
-  FDCT_POSTPROC_2V_NEG_H(in20, in21);
-  FDCT_POSTPROC_2V_NEG_H(in26, in27);
-
-  in18 = LD_SH(temp + 16);
-  in19 = LD_SH(temp + 24);
-  in28 = LD_SH(temp + 96);
-  in29 = LD_SH(temp + 104);
-
-  FDCT_POSTPROC_2V_NEG_H(in18, in19);
-  FDCT_POSTPROC_2V_NEG_H(in28, in29);
-
-  vec4 = in19 - in20;
-  ST_SH(vec4, interm_ptr + 32);
-  vec4 = in18 - in21;
-  ST_SH(vec4, interm_ptr + 88);
-  vec4 = in29 - in26;
-  ST_SH(vec4, interm_ptr + 64);
-  vec4 = in28 - in27;
-  ST_SH(vec4, interm_ptr + 56);
-
-  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
-
-  in22 = LD_SH(temp + 48);
-  in23 = LD_SH(temp + 56);
-  in24 = LD_SH(temp + 64);
-  in25 = LD_SH(temp + 72);
-
-  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
-  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
-  FDCT_POSTPROC_2V_NEG_H(in22, in23);
-  FDCT_POSTPROC_2V_NEG_H(in24, in25);
-
-  in16 = LD_SH(temp);
-  in17 = LD_SH(temp + 8);
-  in30 = LD_SH(temp + 112);
-  in31 = LD_SH(temp + 120);
-
-  FDCT_POSTPROC_2V_NEG_H(in16, in17);
-  FDCT_POSTPROC_2V_NEG_H(in30, in31);
-
-  vec4 = in17 - in22;
-  ST_SH(vec4, interm_ptr + 40);
-  vec4 = in30 - in25;
-  ST_SH(vec4, interm_ptr + 48);
-  vec4 = in31 - in24;
-  ST_SH(vec4, interm_ptr + 72);
-  vec4 = in16 - in23;
-  ST_SH(vec4, interm_ptr + 80);
-
-  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
-  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
-  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
-  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
-  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
-  ADD2(in27, in26, in25, in24, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
-  ST_SH(vec5, out);
-  ST_SH(vec4, out + 120);
-
-  SUB2(in27, in26, in25, in24, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
-  ST_SH(vec5, out + 112);
-  ST_SH(vec4, out + 8);
-
-  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
-  DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
-  SUB2(in26, in27, in24, in25, in23, in20);
-  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
-  ST_SH(vec4, out + 16);
-  ST_SH(vec5, out + 104);
-
-  ADD2(in26, in27, in24, in25, in22, in21);
-  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
-  ST_SH(vec4, out + 24);
-  ST_SH(vec5, out + 96);
-
-  in20 = LD_SH(interm_ptr + 32);
-  in21 = LD_SH(interm_ptr + 88);
-  in27 = LD_SH(interm_ptr + 56);
-  in26 = LD_SH(interm_ptr + 64);
-
-  in16 = in20;
-  in17 = in21;
-  DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
-  DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
-
-  in22 = LD_SH(interm_ptr + 40);
-  in25 = LD_SH(interm_ptr + 48);
-  in24 = LD_SH(interm_ptr + 72);
-  in23 = LD_SH(interm_ptr + 80);
-
-  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
-  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
-  in16 = in28 + in29;
-  in19 = in31 + in30;
-  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
-  ST_SH(vec5, out + 32);
-  ST_SH(vec4, out + 88);
-
-  SUB2(in28, in29, in31, in30, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
-  ST_SH(vec5, out + 40);
-  ST_SH(vec4, out + 80);
-
-  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
-  DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
-  SUB2(in29, in28, in30, in31, in16, in19);
-  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
-  ST_SH(vec5, out + 72);
-  ST_SH(vec4, out + 48);
-
-  ADD2(in29, in28, in30, in31, in17, in18);
-  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
-  ST_SH(vec4, out + 56);
-  ST_SH(vec5, out + 64);
-}
-
-static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
-                               int16_t *output) {
-  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
-  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
-  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
-  fdct8x32_1d_row_transpose_store(tmp_buf, output);
-}
-
-void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
-                          int32_t src_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
-
-  /* column transform */
-  for (i = 0; i < 4; ++i) {
-    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
-                       &tmp_buf_big[0] + (8 * i));
-  }
-
-  /* row transform */
-  for (i = 0; i < 4; ++i) {
-    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
-                       out + (8 * i * 32));
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
deleted file mode 100644
index 7a285b7b8..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/fwd_txfm_msa.h"
-
-void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
-                        int32_t src_stride) {
-  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
-  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
-  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,
-                  -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
-  v8i16 coeff1 = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,
-                   cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };
-  v8i16 coeff2 = {
-    -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0
-  };
-
-  LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,
-          in10, in11, in12, in13, in14, in15);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  SLLI_4V(in8, in9, in10, in11, 2);
-  SLLI_4V(in12, in13, in14, in15, 2);
-  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
-  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
-  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
-                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
-  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
-  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
-  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
-
-  tmp_ptr += 16;
-
-  /* stp 1 */
-  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
-  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
-
-  cnst4 = __msa_splati_h(coeff, 0);
-  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
-
-  cnst5 = __msa_splati_h(coeff, 1);
-  cnst5 = __msa_ilvev_h(cnst5, cnst4);
-  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
-  stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
-  stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
-
-  /* stp2 */
-  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
-  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
-  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
-  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
-  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
-
-  cnst0 = __msa_splati_h(coeff, 4);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-  stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
-
-  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
-  ILVRL_H2_SH(in15, in8, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr);
-
-  cnst0 = __msa_splati_h(coeff2, 0);
-  cnst0 = __msa_ilvev_h(cnst1, cnst0);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 224);
-
-  ILVRL_H2_SH(in14, in9, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
-  ST_SH(in8, tmp_ptr + 128);
-
-  cnst1 = __msa_splati_h(coeff2, 2);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 96);
-
-  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
-  stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
-
-  cnst1 = __msa_splati_h(coeff, 3);
-  cnst1 = __msa_ilvev_h(cnst0, cnst1);
-  stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
-
-  /* stp4 */
-  ADD2(stp34, stp25, stp33, stp22, in13, in10);
-
-  ILVRL_H2_SH(in13, in10, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 64);
-
-  cnst0 = __msa_splati_h(coeff2, 1);
-  cnst0 = __msa_ilvev_h(cnst1, cnst0);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 160);
-
-  SUB2(stp34, stp25, stp33, stp22, in12, in11);
-  ILVRL_H2_SH(in12, in11, vec1, vec0);
-  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
-  cnst1 = __msa_ilvev_h(cnst1, cnst0);
-
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
-  ST_SH(in8, tmp_ptr + 192);
-
-  cnst1 = __msa_splati_h(coeff2, 3);
-  cnst0 = __msa_ilvev_h(cnst0, cnst1);
-  in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
-  ST_SH(in8, tmp_ptr + 32);
-}
-
-void fdct16x8_1d_row(int16_t *input, int16_t *output) {
-  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
-
-  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
-  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                     in10, in11, in12, in13, in14, in15);
-  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
-  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
-  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
-  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
-  SRA_4V(in0, in1, in2, in3, 2);
-  SRA_4V(in4, in5, in6, in7, 2);
-  SRA_4V(in8, in9, in10, in11, 2);
-  SRA_4V(in12, in13, in14, in15, 2);
-  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
-               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
-               tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
-  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
-  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
-                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
-  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
-  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
-               in4, in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
-                     tmp1, in1, tmp2, in2, tmp3, in3);
-  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
-  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
-                     tmp5, in5, tmp6, in6, tmp7, in7);
-  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
-}
-
-void aom_fdct4x4_msa(const int16_t *input, int16_t *output,
-                     int32_t src_stride) {
-  v8i16 in0, in1, in2, in3;
-
-  LD_SH4(input, src_stride, in0, in1, in2, in3);
-
-  /* fdct4 pre-process */
-  {
-    v8i16 vec, mask;
-    v16i8 zero = { 0 };
-    v16i8 one = __msa_ldi_b(1);
-
-    mask = (v8i16)__msa_sldi_b(zero, one, 15);
-    SLLI_4V(in0, in1, in2, in3, 4);
-    vec = __msa_ceqi_h(in0, 0);
-    vec = vec ^ 255;
-    vec = mask & vec;
-    in0 += vec;
-  }
-
-  AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
-  SRA_4V(in0, in1, in2, in3, 2);
-  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
-  ST_SH2(in0, in2, output, 8);
-}
-
-void aom_fdct8x8_msa(const int16_t *input, int16_t *output,
-                     int32_t src_stride) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
-  LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7);
-  SLLI_4V(in0, in1, in2, in3, 2);
-  SLLI_4V(in4, in5, in6, in7, 2);
-  AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-            in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
-            in5, in6, in7);
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
-  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
-}
-
-void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[0] = LD_HADD(input, stride);
-  out[1] = 0;
-}
-
-void aom_fdct16x16_msa(const int16_t *input, int16_t *output,
-                       int32_t src_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
-
-  /* column transform */
-  for (i = 0; i < 2; ++i) {
-    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
-  }
-
-  /* row transform */
-  for (i = 0; i < 2; ++i) {
-    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
deleted file mode 100644
index ada25dffd..000000000
--- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_
-#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_
-
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_dsp/txfm_common.h"
-
-#define LD_HADD(psrc, stride)                                                  \
-  ({                                                                           \
-    v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;              \
-    v4i32 vec_w_m;                                                             \
-                                                                               \
-    LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                        \
-    ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                            \
-    LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);         \
-    ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
-         in0_m, in4_m);                                                        \
-    in0_m += in4_m;                                                            \
-                                                                               \
-    vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                                    \
-    HADD_SW_S32(vec_w_m);                                                      \
-  })
-
-#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                  \
-  {                                                                            \
-    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                                  \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m;                                      \
-    v4i32 vec4_m, vec5_m, vec6_m, vec7_m;                                      \
-    v8i16 coeff_m = {                                                          \
-      cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \
-    };                                                                         \
-                                                                               \
-    BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);           \
-    ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m);                \
-    SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m);                             \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m);                                  \
-                                                                               \
-    SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m);                             \
-    cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m);                                 \
-    vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m);                                  \
-                                                                               \
-    vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m);                                  \
-    cnst2_m = __msa_splati_h(coeff_m, 2);                                      \
-    cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m);                                 \
-    vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m);                                  \
-                                                                               \
-    SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS);               \
-    PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m,        \
-                vec7_m, out0, out2, out1, out3);                               \
-  }
-
-#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)              \
-  {                                                                          \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
-                                                                             \
-    SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15);      \
-    SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15);      \
-    AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \
-               in2, in3);                                                    \
-    AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \
-               in6, in7);                                                    \
-  }
-
-#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
-                  out3, out4, out5, out6, out7)                              \
-  {                                                                          \
-    v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                          \
-    v8i16 s7_m, x0_m, x1_m, x2_m, x3_m;                                      \
-    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,   \
-                      cospi_4_64,  cospi_28_64,  cospi_12_64, cospi_20_64 }; \
-                                                                             \
-    /* FDCT stage1 */                                                        \
-    BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m,    \
-                s3_m, s4_m, s5_m, s6_m, s7_m);                               \
-    BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);             \
-    ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                          \
-    ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                          \
-    SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                 \
-    x2_m = -x2_m;                                                            \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    x2_m = __msa_splati_h(coeff_m, 2);                                       \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    /* stage2 */                                                             \
-    ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                     \
-                                                                             \
-    s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    /* stage3 */                                                             \
-    BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);             \
-                                                                             \
-    /* stage4 */                                                             \
-    ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                          \
-    ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x0_m, x1_m);                                        \
-    out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                 \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-                                                                             \
-    x1_m = __msa_splati_h(coeff_m, 5);                                       \
-    x0_m = -x0_m;                                                            \
-    x0_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                          \
-                                                                             \
-    x2_m = __msa_splati_h(coeff_m, 6);                                       \
-    x3_m = -x3_m;                                                            \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-  }
-
-#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
-                      out2, out3, out4, out5, out6, out7)                    \
-  {                                                                          \
-    v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                    \
-    v8i16 x0_m, x1_m, x2_m, x3_m;                                            \
-    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,   \
-                      cospi_4_64,  cospi_28_64,  cospi_12_64, cospi_20_64 }; \
-                                                                             \
-    /* FDCT stage1 */                                                        \
-    BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m,    \
-                s3_m, s4_m, s5_m, s6_m, s7_m);                               \
-    BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);             \
-    ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                          \
-    ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                          \
-    SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                 \
-    x2_m = -x2_m;                                                            \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    x2_m = __msa_splati_h(coeff_m, 2);                                       \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                          \
-                                                                             \
-    /* stage2 */                                                             \
-    ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                     \
-                                                                             \
-    s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                          \
-    s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                          \
-                                                                             \
-    /* stage3 */                                                             \
-    BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);             \
-                                                                             \
-    /* stage4 */                                                             \
-    ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                          \
-    ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                 \
-    x1_m = __msa_ilvev_h(x0_m, x1_m);                                        \
-    out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                          \
-                                                                             \
-    SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                 \
-    x2_m = __msa_ilvev_h(x3_m, x2_m);                                        \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-                                                                             \
-    x1_m = __msa_splati_h(coeff_m, 5);                                       \
-    x0_m = -x0_m;                                                            \
-    x0_m = __msa_ilvev_h(x1_m, x0_m);                                        \
-    out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                          \
-                                                                             \
-    x2_m = __msa_splati_h(coeff_m, 6);                                       \
-    x3_m = -x3_m;                                                            \
-    x2_m = __msa_ilvev_h(x2_m, x3_m);                                        \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                          \
-  }
-
-#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,   \
-                     input7, out1, out3, out5, out7, out9, out11, out13,       \
-                     out15)                                                    \
-  {                                                                            \
-    v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;                \
-    v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;                \
-    v8i16 stp36_m, stp37_m, vec0_m, vec1_m;                                    \
-    v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                              \
-    v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m;                                  \
-    v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,  cospi_24_64,     \
-                      -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };   \
-    v8i16 coeff1_m = { cospi_2_64,  cospi_30_64, cospi_14_64, cospi_18_64,     \
-                       cospi_10_64, cospi_22_64, cospi_6_64,  cospi_26_64 };   \
-    v8i16 coeff2_m = {                                                         \
-      -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0        \
-    };                                                                         \
-                                                                               \
-    /* stp 1 */                                                                \
-    ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m);                \
-    ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m);                \
-                                                                               \
-    cnst4_m = __msa_splati_h(coeff_m, 0);                                      \
-    stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m);                  \
-                                                                               \
-    cnst5_m = __msa_splati_h(coeff_m, 1);                                      \
-    cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m);                                 \
-    stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m);                  \
-    stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m);                  \
-    stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m);                  \
-                                                                               \
-    /* stp2 */                                                                 \
-    BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m,   \
-                stp33_m);                                                      \
-    BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m,   \
-                stp34_m);                                                      \
-                                                                               \
-    ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m);            \
-    ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m);            \
-                                                                               \
-    SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m);                             \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                  \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff_m, 4);                                      \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                  \
-                                                                               \
-    SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m);                             \
-    cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);                  \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff_m, 3);                                      \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);                  \
-                                                                               \
-    /* stp4 */                                                                 \
-    BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m,    \
-                vec5_m);                                                       \
-    BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \
-                stp31_m);                                                      \
-                                                                               \
-    ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m);                               \
-    SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m);                            \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-                                                                               \
-    out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff2_m, 0);                                     \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
-                                                                               \
-    ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m);                               \
-    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                            \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-                                                                               \
-    out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                     \
-                                                                               \
-    cnst1_m = __msa_splati_h(coeff2_m, 2);                                     \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-                                                                               \
-    ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m);                             \
-    SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m);                            \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-                                                                               \
-    cnst0_m = __msa_splati_h(coeff2_m, 1);                                     \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-    out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                    \
-                                                                               \
-    ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m);                             \
-    SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m);                            \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                                 \
-                                                                               \
-    out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                    \
-                                                                               \
-    cnst1_m = __msa_splati_h(coeff2_m, 3);                                     \
-    cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                                 \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                     \
-  }
-
-#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
-  {                                        \
-    v8i16 tp0_m, tp1_m;                    \
-    v8i16 one_m = __msa_ldi_h(1);          \
-                                           \
-    tp0_m = __msa_clti_s_h(vec0, 0);       \
-    tp1_m = __msa_clti_s_h(vec1, 0);       \
-    vec0 += 1;                             \
-    vec1 += 1;                             \
-    tp0_m = one_m & tp0_m;                 \
-    tp1_m = one_m & tp1_m;                 \
-    vec0 += tp0_m;                         \
-    vec1 += tp1_m;                         \
-    vec0 >>= 2;                            \
-    vec1 >>= 2;                            \
-  }
-
-#define FDCT32_POSTPROC_NEG_W(vec)   \
-  {                                  \
-    v4i32 temp_m;                    \
-    v4i32 one_m = __msa_ldi_w(1);    \
-                                     \
-    temp_m = __msa_clti_s_w(vec, 0); \
-    vec += 1;                        \
-    temp_m = one_m & temp_m;         \
-    vec += temp_m;                   \
-    vec >>= 2;                       \
-  }
-
-#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1)        \
-  {                                                 \
-    v8i16 tp0_m, tp1_m;                             \
-    v8i16 one = __msa_ldi_h(1);                     \
-                                                    \
-    tp0_m = __msa_clei_s_h(vec0, 0);                \
-    tp1_m = __msa_clei_s_h(vec1, 0);                \
-    tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
-    tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
-    vec0 += 1;                                      \
-    vec1 += 1;                                      \
-    tp0_m = one & tp0_m;                            \
-    tp1_m = one & tp1_m;                            \
-    vec0 += tp0_m;                                  \
-    vec1 += tp1_m;                                  \
-    vec0 >>= 2;                                     \
-    vec1 >>= 2;                                     \
-  }
-
-#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
-                          const0, const1, out0, out1, out2, out3)       \
-  {                                                                     \
-    v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
-    v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                                   \
-    v4i32 k0_m = __msa_fill_w((int32_t)const0);                         \
-                                                                        \
-    s0_m = __msa_fill_w((int32_t)const1);                               \
-    k0_m = __msa_ilvev_w(s0_m, k0_m);                                   \
-                                                                        \
-    ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m);                     \
-    ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m);                      \
-    ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m);                   \
-    ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m);                    \
-                                                                        \
-    DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m);                  \
-    DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m);                  \
-    tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                       \
-    tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                       \
-    tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                       \
-    tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                       \
-    out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);                   \
-    out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);                   \
-                                                                        \
-    DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m);                  \
-    DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m);                  \
-    tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                       \
-    tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                       \
-    tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                       \
-    tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                       \
-    out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);                   \
-    out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);                   \
-  }
-
-void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
-                        int32_t src_stride);
-void fdct16x8_1d_row(int16_t *input, int16_t *output);
-#endif  // AOM_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c
deleted file mode 100644
index 0ea127f52..000000000
--- a/third_party/aom/aom_dsp/mips/idct16x16_msa.c
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
-  v8i16 loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
-  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
-  v8i16 tmp5, tmp6, tmp7;
-
-  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  input += 8;
-  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1,
-                     reg2, reg3, reg4, reg5, reg6, reg7);
-  TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8,
-                     reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
-  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
-  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
-  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
-  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
-  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
-  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
-  SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4,
-       reg8);
-  ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6,
-       reg10);
-
-  /* stage 2 */
-  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
-  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
-
-  reg9 = reg1 - loc2;
-  reg1 = reg1 + loc2;
-  reg7 = reg15 - loc3;
-  reg15 = reg15 + loc3;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
-  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
-  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
-
-  loc1 = reg15 + reg3;
-  reg3 = reg15 - reg3;
-  loc2 = reg2 + loc1;
-  reg15 = reg2 - loc1;
-
-  loc1 = reg1 + reg13;
-  reg13 = reg1 - reg13;
-  loc0 = reg0 + loc1;
-  loc1 = reg0 - loc1;
-  tmp6 = loc0;
-  tmp7 = loc1;
-  reg0 = loc2;
-
-  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
-  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
-
-  loc0 = reg9 + reg5;
-  reg5 = reg9 - reg5;
-  reg2 = reg6 + loc0;
-  reg1 = reg6 - loc0;
-
-  loc0 = reg7 + reg11;
-  reg11 = reg7 - reg11;
-  loc1 = reg4 + loc0;
-  loc2 = reg4 - loc0;
-  tmp5 = loc1;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
-  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
-
-  reg10 = loc0;
-  reg11 = loc1;
-
-  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
-  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
-
-  reg13 = loc2;
-
-  /* Transpose and store the output */
-  reg12 = tmp5;
-  reg14 = tmp6;
-  reg3 = tmp7;
-
-  /* transpose block */
-  TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0,
-                     reg2, reg4, reg6, reg8, reg10, reg12, reg14);
-  ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16);
-
-  /* transpose block */
-  TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3,
-                     reg13, reg11, reg5, reg7, reg9, reg1, reg15);
-  ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16);
-}
-
-void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                      int32_t dst_stride) {
-  v8i16 loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
-  v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
-  v8i16 tmp5, tmp6, tmp7;
-
-  /* load up 8x8 */
-  LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  input += 8 * 16;
-  /* load bottom 8x8 */
-  LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
-
-  DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
-  DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
-  BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2);
-  DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
-  DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
-  DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
-  BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14);
-
-  reg0 = reg2 - loc1;
-  reg2 = reg2 + loc1;
-  reg12 = reg14 - loc0;
-  reg14 = reg14 + loc0;
-  reg4 = reg6 - loc3;
-  reg6 = reg6 + loc3;
-  reg8 = reg10 - loc2;
-  reg10 = reg10 + loc2;
-
-  /* stage 2 */
-  DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
-  DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
-
-  reg9 = reg1 - loc2;
-  reg1 = reg1 + loc2;
-  reg7 = reg15 - loc3;
-  reg15 = reg15 + loc3;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
-  DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
-  BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5);
-
-  loc1 = reg15 + reg3;
-  reg3 = reg15 - reg3;
-  loc2 = reg2 + loc1;
-  reg15 = reg2 - loc1;
-
-  loc1 = reg1 + reg13;
-  reg13 = reg1 - reg13;
-  loc0 = reg0 + loc1;
-  loc1 = reg0 - loc1;
-  tmp6 = loc0;
-  tmp7 = loc1;
-  reg0 = loc2;
-
-  DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
-  DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
-
-  loc0 = reg9 + reg5;
-  reg5 = reg9 - reg5;
-  reg2 = reg6 + loc0;
-  reg1 = reg6 - loc0;
-
-  loc0 = reg7 + reg11;
-  reg11 = reg7 - reg11;
-  loc1 = reg4 + loc0;
-  loc2 = reg4 - loc0;
-  tmp5 = loc1;
-
-  DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
-  BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1);
-
-  reg10 = loc0;
-  reg11 = loc1;
-
-  DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
-  BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5);
-  reg13 = loc2;
-
-  /* Transpose and store the output */
-  reg12 = tmp5;
-  reg14 = tmp6;
-  reg3 = tmp7;
-
-  SRARI_H4_SH(reg0, reg2, reg4, reg6, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6);
-  dst += (4 * dst_stride);
-  SRARI_H4_SH(reg8, reg10, reg12, reg14, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14);
-  dst += (4 * dst_stride);
-  SRARI_H4_SH(reg3, reg13, reg11, reg5, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5);
-  dst += (4 * dst_stride);
-  SRARI_H4_SH(reg7, reg9, reg1, reg15, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15);
-}
-
-void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst,
-                               int32_t dst_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
-  int16_t *out = out_arr;
-
-  /* transform rows */
-  for (i = 0; i < 2; ++i) {
-    /* process 16 * 8 block */
-    aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
-  }
-
-  /* transform columns */
-  for (i = 0; i < 2; ++i) {
-    /* process 8 * 16 block */
-    aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
-                                     dst_stride);
-  }
-}
-
-void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst,
-                              int32_t dst_stride) {
-  uint8_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
-  int16_t *out = out_arr;
-
-  /* process 16 * 8 block */
-  aom_idct16_1d_rows_msa(input, out);
-
-  /* short case just considers top 4 rows as valid output */
-  out += 4 * 16;
-  for (i = 12; i--;) {
-    __asm__ __volatile__(
-        "sw     $zero,   0(%[out])     \n\t"
-        "sw     $zero,   4(%[out])     \n\t"
-        "sw     $zero,   8(%[out])     \n\t"
-        "sw     $zero,  12(%[out])     \n\t"
-        "sw     $zero,  16(%[out])     \n\t"
-        "sw     $zero,  20(%[out])     \n\t"
-        "sw     $zero,  24(%[out])     \n\t"
-        "sw     $zero,  28(%[out])     \n\t"
-
-        :
-        : [out] "r"(out));
-
-    out += 16;
-  }
-
-  out = out_arr;
-
-  /* transform columns */
-  for (i = 0; i < 2; ++i) {
-    /* process 8 * 16 block */
-    aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)),
-                                     dst_stride);
-  }
-}
-
-void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst,
-                             int32_t dst_stride) {
-  uint8_t i;
-  int16_t out;
-  v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7;
-  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO(out, 6);
-
-  vec = __msa_fill_h(out);
-
-  for (i = 4; i--;) {
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    UNPCK_UB_SH(dst0, res0, res4);
-    UNPCK_UB_SH(dst1, res1, res5);
-    UNPCK_UB_SH(dst2, res2, res6);
-    UNPCK_UB_SH(dst3, res3, res7);
-    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
-    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
-    CLIP_SH4_0_255(res0, res1, res2, res3);
-    CLIP_SH4_0_255(res4, res5, res6, res7);
-    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
-                tmp2, tmp3);
-    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
-
-  /* load input data */
-  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
-          l7, l15);
-  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6,
-                     l7);
-  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11,
-                     l12, l13, l14, l15);
-
-  /* ADST in horizontal */
-  AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13,
-                   l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,
-                   r12, r13, r14, r15);
-
-  l1 = -r8;
-  l3 = -r4;
-  l13 = -r13;
-  l15 = -r1;
-
-  TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5,
-                     l6, l7);
-  ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16);
-  TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12,
-                     l13, l14, l15);
-  ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16);
-}
-
-void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                       int32_t dst_stride) {
-  v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
-  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
-  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
-  v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
-  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
-  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
-  v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
-  v16i8 zero = { 0 };
-
-  r0 = LD_SH(input + 0 * 16);
-  r3 = LD_SH(input + 3 * 16);
-  r4 = LD_SH(input + 4 * 16);
-  r7 = LD_SH(input + 7 * 16);
-  r8 = LD_SH(input + 8 * 16);
-  r11 = LD_SH(input + 11 * 16);
-  r12 = LD_SH(input + 12 * 16);
-  r15 = LD_SH(input + 15 * 16);
-
-  /* stage 1 */
-  k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
-  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
-  k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
-  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
-  BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
-  k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
-  k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
-  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
-
-  r1 = LD_SH(input + 1 * 16);
-  r2 = LD_SH(input + 2 * 16);
-  r5 = LD_SH(input + 5 * 16);
-  r6 = LD_SH(input + 6 * 16);
-  r9 = LD_SH(input + 9 * 16);
-  r10 = LD_SH(input + 10 * 16);
-  r13 = LD_SH(input + 13 * 16);
-  r14 = LD_SH(input + 14 * 16);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
-  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
-  k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
-  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
-  BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
-  BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
-  out1 = -out1;
-  SRARI_H2_SH(out0, out1, 6);
-  dst0 = LD_UB(dst + 0 * dst_stride);
-  dst1 = LD_UB(dst + 15 * dst_stride);
-  ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1);
-  ADD2(res0, out0, res1, out1, res0, res1);
-  CLIP_SH2_0_255(res0, res1);
-  PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
-  ST8x1_UB(res0, dst);
-  ST8x1_UB(res1, dst + 15 * dst_stride);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
-  k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
-  MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
-  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
-  out8 = -out8;
-
-  SRARI_H2_SH(out8, out9, 6);
-  dst8 = LD_UB(dst + 1 * dst_stride);
-  dst9 = LD_UB(dst + 14 * dst_stride);
-  ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9);
-  ADD2(res8, out8, res9, out9, res8, res9);
-  CLIP_SH2_0_255(res8, res9);
-  PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
-  ST8x1_UB(res8, dst + dst_stride);
-  ST8x1_UB(res9, dst + 14 * dst_stride);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
-  k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
-  MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
-  out4 = -out4;
-  SRARI_H2_SH(out4, out5, 6);
-  dst4 = LD_UB(dst + 3 * dst_stride);
-  dst5 = LD_UB(dst + 12 * dst_stride);
-  ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5);
-  ADD2(res4, out4, res5, out5, res4, res5);
-  CLIP_SH2_0_255(res4, res5);
-  PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
-  ST8x1_UB(res4, dst + 3 * dst_stride);
-  ST8x1_UB(res5, dst + 12 * dst_stride);
-
-  MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
-  out13 = -out13;
-  SRARI_H2_SH(out12, out13, 6);
-  dst12 = LD_UB(dst + 2 * dst_stride);
-  dst13 = LD_UB(dst + 13 * dst_stride);
-  ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13);
-  ADD2(res12, out12, res13, out13, res12, res13);
-  CLIP_SH2_0_255(res12, res13);
-  PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
-  ST8x1_UB(res12, dst + 2 * dst_stride);
-  ST8x1_UB(res13, dst + 13 * dst_stride);
-
-  k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
-  k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
-  MADD_SHORT(out6, out7, k0, k3, out6, out7);
-  SRARI_H2_SH(out6, out7, 6);
-  dst6 = LD_UB(dst + 4 * dst_stride);
-  dst7 = LD_UB(dst + 11 * dst_stride);
-  ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7);
-  ADD2(res6, out6, res7, out7, res6, res7);
-  CLIP_SH2_0_255(res6, res7);
-  PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
-  ST8x1_UB(res6, dst + 4 * dst_stride);
-  ST8x1_UB(res7, dst + 11 * dst_stride);
-
-  MADD_SHORT(out10, out11, k0, k3, out10, out11);
-  SRARI_H2_SH(out10, out11, 6);
-  dst10 = LD_UB(dst + 6 * dst_stride);
-  dst11 = LD_UB(dst + 9 * dst_stride);
-  ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11);
-  ADD2(res10, out10, res11, out11, res10, res11);
-  CLIP_SH2_0_255(res10, res11);
-  PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
-  ST8x1_UB(res10, dst + 6 * dst_stride);
-  ST8x1_UB(res11, dst + 9 * dst_stride);
-
-  k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
-  MADD_SHORT(h10, h11, k1, k2, out2, out3);
-  SRARI_H2_SH(out2, out3, 6);
-  dst2 = LD_UB(dst + 7 * dst_stride);
-  dst3 = LD_UB(dst + 8 * dst_stride);
-  ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3);
-  ADD2(res2, out2, res3, out3, res2, res3);
-  CLIP_SH2_0_255(res2, res3);
-  PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
-  ST8x1_UB(res2, dst + 7 * dst_stride);
-  ST8x1_UB(res3, dst + 8 * dst_stride);
-
-  MADD_SHORT(out14, out15, k1, k2, out14, out15);
-  SRARI_H2_SH(out14, out15, 6);
-  dst14 = LD_UB(dst + 5 * dst_stride);
-  dst15 = LD_UB(dst + 10 * dst_stride);
-  ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15);
-  ADD2(res14, out14, res15, out15, res14, res15);
-  CLIP_SH2_0_255(res14, res15);
-  PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
-  ST8x1_UB(res14, dst + 5 * dst_stride);
-  ST8x1_UB(res15, dst + 10 * dst_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c
deleted file mode 100644
index f1ca757a0..000000000
--- a/third_party/aom/aom_dsp/mips/idct32x32_msa.c
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-static void idct32x8_row_transpose_store(const int16_t *input,
-                                         int16_t *tmp_buf) {
-  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
-  /* 1st & 2nd 8x8 */
-  LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
-  LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8);
-  ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8);
-  ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8);
-
-  /* 3rd & 4th 8x8 */
-  LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
-  LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8);
-  ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8);
-  ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8);
-  ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8);
-}
-
-static void idct32x8_row_even_process_store(int16_t *tmp_buf,
-                                            int16_t *tmp_eve_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
-
-  /* Even stage 1 */
-  LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
-  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
-  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
-  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-
-  loc1 = vec3;
-  loc0 = vec1;
-
-  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
-  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
-  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
-  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
-  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
-
-  /* Even stage 2 */
-  LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
-
-  vec0 = reg0 + reg4;
-  reg0 = reg0 - reg4;
-  reg4 = reg6 + reg2;
-  reg6 = reg6 - reg2;
-  reg2 = reg1 + reg5;
-  reg1 = reg1 - reg5;
-  reg5 = reg7 + reg3;
-  reg7 = reg7 - reg3;
-  reg3 = vec0;
-
-  vec1 = reg2;
-  reg2 = reg3 + reg4;
-  reg3 = reg3 - reg4;
-  reg4 = reg5 - vec1;
-  reg5 = reg5 + vec1;
-
-  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
-  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
-
-  vec0 = reg0 - reg6;
-  reg0 = reg0 + reg6;
-  vec1 = reg7 - reg1;
-  reg7 = reg7 + reg1;
-
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
-
-  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
-  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 15 * 8));
-  ST_SH(loc1, (tmp_eve_buf));
-  ST_SH(loc2, (tmp_eve_buf + 14 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 8));
-
-  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 13 * 8));
-  ST_SH(loc1, (tmp_eve_buf + 2 * 8));
-  ST_SH(loc2, (tmp_eve_buf + 12 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 3 * 8));
-
-  /* Store 8 */
-  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 11 * 8));
-  ST_SH(loc1, (tmp_eve_buf + 4 * 8));
-  ST_SH(loc2, (tmp_eve_buf + 10 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 5 * 8));
-
-  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
-  ST_SH(loc0, (tmp_eve_buf + 9 * 8));
-  ST_SH(loc1, (tmp_eve_buf + 6 * 8));
-  ST_SH(loc2, (tmp_eve_buf + 8 * 8));
-  ST_SH(loc3, (tmp_eve_buf + 7 * 8));
-}
-
-static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
-                                           int16_t *tmp_odd_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-
-  /* Odd stage 1 */
-  reg0 = LD_SH(tmp_buf + 8);
-  reg1 = LD_SH(tmp_buf + 7 * 8);
-  reg2 = LD_SH(tmp_buf + 9 * 8);
-  reg3 = LD_SH(tmp_buf + 15 * 8);
-  reg4 = LD_SH(tmp_buf + 17 * 8);
-  reg5 = LD_SH(tmp_buf + 23 * 8);
-  reg6 = LD_SH(tmp_buf + 25 * 8);
-  reg7 = LD_SH(tmp_buf + 31 * 8);
-
-  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
-
-  vec0 = reg0 + reg3;
-  reg0 = reg0 - reg3;
-  reg3 = reg7 + reg4;
-  reg7 = reg7 - reg4;
-  reg4 = reg1 + reg2;
-  reg1 = reg1 - reg2;
-  reg2 = reg6 + reg5;
-  reg6 = reg6 - reg5;
-  reg5 = vec0;
-
-  /* 4 Stores */
-  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
-
-  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf), 8);
-
-  /* 4 Stores */
-  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
-  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
-
-  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
-  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
-
-  /* Odd stage 2 */
-  /* 8 loads */
-  reg0 = LD_SH(tmp_buf + 3 * 8);
-  reg1 = LD_SH(tmp_buf + 5 * 8);
-  reg2 = LD_SH(tmp_buf + 11 * 8);
-  reg3 = LD_SH(tmp_buf + 13 * 8);
-  reg4 = LD_SH(tmp_buf + 19 * 8);
-  reg5 = LD_SH(tmp_buf + 21 * 8);
-  reg6 = LD_SH(tmp_buf + 27 * 8);
-  reg7 = LD_SH(tmp_buf + 29 * 8);
-
-  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
-  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
-
-  /* 4 Stores */
-  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
-
-  BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
-
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
-
-  /* 4 Stores */
-  ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3);
-  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
-  ST_SH(reg0, (tmp_odd_buf + 13 * 8));
-  ST_SH(reg1, (tmp_odd_buf + 14 * 8));
-
-  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
-  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
-
-  /* Load 8 & Store 8 */
-  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
-  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
-
-  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Load 8 & Store 8 */
-  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
-  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
-
-  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
-}
-
-static void idct_butterfly_transpose_store(int16_t *tmp_buf,
-                                           int16_t *tmp_eve_buf,
-                                           int16_t *tmp_odd_buf, int16_t *dst) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
-  /* FINAL BUTTERFLY : Dependency on Even & Odd */
-  vec0 = LD_SH(tmp_odd_buf);
-  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
-  loc0 = LD_SH(tmp_eve_buf);
-  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 31 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 23 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 27 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 19 * 8));
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 29 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 21 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 25 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 17 * 8));
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 30 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 22 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 26 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 18 * 8));
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
-
-  ST_SH((loc0 - vec3), (tmp_buf + 28 * 8));
-  ST_SH((loc1 - vec2), (tmp_buf + 20 * 8));
-  ST_SH((loc2 - vec1), (tmp_buf + 24 * 8));
-  ST_SH((loc3 - vec0), (tmp_buf + 16 * 8));
-
-  /* Transpose : 16 vectors */
-  /* 1st & 2nd 8x8 */
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  ST_SH4(m0, n0, m1, n1, (dst + 0), 32);
-  ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32);
-
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH4(m4, n4, m5, n5, (dst + 8), 32);
-  ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32);
-
-  /* 3rd & 4th 8x8 */
-  LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
-  LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
-  TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
-                     n3);
-  ST_SH4(m0, n0, m1, n1, (dst + 16), 32);
-  ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32);
-
-  TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
-                     n7);
-  ST_SH4(m4, n4, m5, n5, (dst + 24), 32);
-  ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32);
-}
-
-static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
-  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
-  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
-  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
-
-  idct32x8_row_transpose_store(input, &tmp_buf[0]);
-  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
-  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
-  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
-                                 output);
-}
-
-static void idct8x32_column_even_process_store(int16_t *tmp_buf,
-                                               int16_t *tmp_eve_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-  v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
-
-  /* Even stage 1 */
-  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-  tmp_buf += (2 * 32);
-
-  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
-  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
-  BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
-  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-
-  loc1 = vec3;
-  loc0 = vec1;
-
-  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
-  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
-  BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
-  BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
-  BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
-
-  /* Even stage 2 */
-  /* Load 8 */
-  LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
-
-  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
-
-  vec0 = reg0 + reg4;
-  reg0 = reg0 - reg4;
-  reg4 = reg6 + reg2;
-  reg6 = reg6 - reg2;
-  reg2 = reg1 + reg5;
-  reg1 = reg1 - reg5;
-  reg5 = reg7 + reg3;
-  reg7 = reg7 - reg3;
-  reg3 = vec0;
-
-  vec1 = reg2;
-  reg2 = reg3 + reg4;
-  reg3 = reg3 - reg4;
-  reg4 = reg5 - vec1;
-  reg5 = reg5 + vec1;
-
-  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
-  DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
-
-  vec0 = reg0 - reg6;
-  reg0 = reg0 + reg6;
-  vec1 = reg7 - reg1;
-  reg7 = reg7 + reg1;
-
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
-
-  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
-  /* Store 8 */
-  BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, tmp_eve_buf, 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8);
-
-  BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8);
-
-  /* Store 8 */
-  BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8);
-
-  BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
-  ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8);
-  ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8);
-}
-
-static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
-                                              int16_t *tmp_odd_buf) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-
-  /* Odd stage 1 */
-  reg0 = LD_SH(tmp_buf + 32);
-  reg1 = LD_SH(tmp_buf + 7 * 32);
-  reg2 = LD_SH(tmp_buf + 9 * 32);
-  reg3 = LD_SH(tmp_buf + 15 * 32);
-  reg4 = LD_SH(tmp_buf + 17 * 32);
-  reg5 = LD_SH(tmp_buf + 23 * 32);
-  reg6 = LD_SH(tmp_buf + 25 * 32);
-  reg7 = LD_SH(tmp_buf + 31 * 32);
-
-  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
-
-  vec0 = reg0 + reg3;
-  reg0 = reg0 - reg3;
-  reg3 = reg7 + reg4;
-  reg7 = reg7 - reg4;
-  reg4 = reg1 + reg2;
-  reg1 = reg1 - reg2;
-  reg2 = reg6 + reg5;
-  reg6 = reg6 - reg5;
-  reg5 = vec0;
-
-  /* 4 Stores */
-  ADD2(reg5, reg4, reg3, reg2, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8);
-  SUB2(reg5, reg4, reg3, reg2, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
-  ST_SH2(vec0, vec1, tmp_odd_buf, 8);
-
-  /* 4 Stores */
-  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
-  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
-  BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8);
-  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
-  ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8);
-
-  /* Odd stage 2 */
-  /* 8 loads */
-  reg0 = LD_SH(tmp_buf + 3 * 32);
-  reg1 = LD_SH(tmp_buf + 5 * 32);
-  reg2 = LD_SH(tmp_buf + 11 * 32);
-  reg3 = LD_SH(tmp_buf + 13 * 32);
-  reg4 = LD_SH(tmp_buf + 19 * 32);
-  reg5 = LD_SH(tmp_buf + 21 * 32);
-  reg6 = LD_SH(tmp_buf + 27 * 32);
-  reg7 = LD_SH(tmp_buf + 29 * 32);
-
-  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
-  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
-  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
-  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
-
-  /* 4 Stores */
-  SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
-  BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8);
-  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
-  ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8);
-
-  /* 4 Stores */
-  ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3);
-  BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
-  ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8);
-  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
-  ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
-  /* Load 8 & Store 8 */
-  LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3);
-  LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8);
-
-  SUB2(reg0, reg4, reg1, reg5, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg2, reg6, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8);
-
-  /* Load 8 & Store 8 */
-  LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3);
-  LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7);
-
-  ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8);
-
-  SUB2(reg0, reg4, reg3, reg7, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
-
-  SUB2(reg1, reg5, reg2, reg6, vec0, vec1);
-  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
-  ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8);
-}
-
-static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
-                                             int16_t *tmp_odd_buf, uint8_t *dst,
-                                             int32_t dst_stride) {
-  v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
-  v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7;
-
-  /* FINAL BUTTERFLY : Dependency on Even & Odd */
-  vec0 = LD_SH(tmp_odd_buf);
-  vec1 = LD_SH(tmp_odd_buf + 9 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 14 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 6 * 8);
-  loc0 = LD_SH(tmp_eve_buf);
-  loc1 = LD_SH(tmp_eve_buf + 8 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 4 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 12 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6);
-  SRARI_H4_SH(m0, m2, m4, m6, 6);
-  AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0);
-  SRARI_H4_SH(m0, m2, m4, m6, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4,
-                      m6);
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 4 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 13 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 10 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 3 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 2 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 10 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 6 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 14 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7);
-  SRARI_H4_SH(m1, m3, m5, m7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1);
-  SRARI_H4_SH(m1, m3, m5, m7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5,
-                      m7);
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 2 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 11 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 12 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 7 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 1 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 9 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 5 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 13 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6);
-  SRARI_H4_SH(n0, n2, n4, n6, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0);
-  SRARI_H4_SH(n0, n2, n4, n6, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4,
-                      n6);
-
-  /* Load 8 & Store 8 */
-  vec0 = LD_SH(tmp_odd_buf + 5 * 8);
-  vec1 = LD_SH(tmp_odd_buf + 15 * 8);
-  vec2 = LD_SH(tmp_odd_buf + 8 * 8);
-  vec3 = LD_SH(tmp_odd_buf + 1 * 8);
-  loc0 = LD_SH(tmp_eve_buf + 3 * 8);
-  loc1 = LD_SH(tmp_eve_buf + 11 * 8);
-  loc2 = LD_SH(tmp_eve_buf + 7 * 8);
-  loc3 = LD_SH(tmp_eve_buf + 15 * 8);
-
-  ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7);
-  SRARI_H4_SH(n1, n3, n5, n7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7);
-
-  SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1);
-  SRARI_H4_SH(n1, n3, n5, n7, 6);
-  AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5,
-                      n7);
-}
-
-static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                           int32_t dst_stride) {
-  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
-  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
-
-  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
-  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
-  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
-                                   dst_stride);
-}
-
-void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst,
-                                int32_t dst_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
-  int16_t *out_ptr = out_arr;
-
-  /* transform rows */
-  for (i = 0; i < 4; ++i) {
-    /* process 32 * 8 block */
-    idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
-  }
-
-  /* transform columns */
-  for (i = 0; i < 4; ++i) {
-    /* process 8 * 32 block */
-    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
-                                   dst_stride);
-  }
-}
-
-void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst,
-                              int32_t dst_stride) {
-  int32_t i;
-  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
-  int16_t *out_ptr = out_arr;
-
-  for (i = 32; i--;) {
-    __asm__ __volatile__(
-        "sw     $zero,      0(%[out_ptr])     \n\t"
-        "sw     $zero,      4(%[out_ptr])     \n\t"
-        "sw     $zero,      8(%[out_ptr])     \n\t"
-        "sw     $zero,     12(%[out_ptr])     \n\t"
-        "sw     $zero,     16(%[out_ptr])     \n\t"
-        "sw     $zero,     20(%[out_ptr])     \n\t"
-        "sw     $zero,     24(%[out_ptr])     \n\t"
-        "sw     $zero,     28(%[out_ptr])     \n\t"
-        "sw     $zero,     32(%[out_ptr])     \n\t"
-        "sw     $zero,     36(%[out_ptr])     \n\t"
-        "sw     $zero,     40(%[out_ptr])     \n\t"
-        "sw     $zero,     44(%[out_ptr])     \n\t"
-        "sw     $zero,     48(%[out_ptr])     \n\t"
-        "sw     $zero,     52(%[out_ptr])     \n\t"
-        "sw     $zero,     56(%[out_ptr])     \n\t"
-        "sw     $zero,     60(%[out_ptr])     \n\t"
-
-        :
-        : [out_ptr] "r"(out_ptr));
-
-    out_ptr += 32;
-  }
-
-  out_ptr = out_arr;
-
-  /* rows: only upper-left 8x8 has non-zero coeff */
-  idct32x8_1d_rows_msa(input, out_ptr);
-
-  /* transform columns */
-  for (i = 0; i < 4; ++i) {
-    /* process 8 * 32 block */
-    idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)),
-                                   dst_stride);
-  }
-}
-
-void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst,
-                             int32_t dst_stride) {
-  int32_t i;
-  int16_t out;
-  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
-  v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO(out, 6);
-
-  vec = __msa_fill_h(out);
-
-  for (i = 16; i--;) {
-    LD_UB2(dst, 16, dst0, dst1);
-    LD_UB2(dst + dst_stride, 16, dst2, dst3);
-
-    UNPCK_UB_SH(dst0, res0, res4);
-    UNPCK_UB_SH(dst1, res1, res5);
-    UNPCK_UB_SH(dst2, res2, res6);
-    UNPCK_UB_SH(dst3, res3, res7);
-    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
-    ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7);
-    CLIP_SH4_0_255(res0, res1, res2, res3);
-    CLIP_SH4_0_255(res4, res5, res6, res7);
-    PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1,
-                tmp2, tmp3);
-
-    ST_UB2(tmp0, tmp1, dst, 16);
-    dst += dst_stride;
-    ST_UB2(tmp2, tmp3, dst, 16);
-    dst += dst_stride;
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c
deleted file mode 100644
index 274818baa..000000000
--- a/third_party/aom/aom_dsp/mips/idct4x4_msa.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3;
-  v4i32 in0_r, in1_r, in2_r, in3_r, in4_r;
-
-  /* load vector elements of 4x4 block */
-  LD4x4_SH(input, in0, in2, in3, in1);
-  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
-  UNPCK_R_SH_SW(in0, in0_r);
-  UNPCK_R_SH_SW(in2, in2_r);
-  UNPCK_R_SH_SW(in3, in3_r);
-  UNPCK_R_SH_SW(in1, in1_r);
-  SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT);
-
-  in0_r += in2_r;
-  in3_r -= in1_r;
-  in4_r = (in0_r - in3_r) >> 1;
-  in1_r = in4_r - in1_r;
-  in2_r = in4_r - in2_r;
-  in0_r -= in1_r;
-  in3_r += in2_r;
-
-  TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r);
-
-  in0_r += in1_r;
-  in2_r -= in3_r;
-  in4_r = (in0_r - in2_r) >> 1;
-  in3_r = in4_r - in3_r;
-  in1_r = in4_r - in1_r;
-  in0_r -= in3_r;
-  in2_r += in1_r;
-
-  PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1,
-              in2, in3);
-  ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride);
-}
-
-void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride) {
-  int16_t a1, e1;
-  v8i16 in1, in0 = { 0 };
-
-  a1 = input[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-
-  in0 = __msa_insert_h(in0, 0, a1);
-  in0 = __msa_insert_h(in0, 1, e1);
-  in0 = __msa_insert_h(in0, 2, e1);
-  in0 = __msa_insert_h(in0, 3, e1);
-
-  in1 = in0 >> 1;
-  in0 -= in1;
-
-  ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride);
-}
-
-void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3;
-
-  /* load vector elements of 4x4 block */
-  LD4x4_SH(input, in0, in1, in2, in3);
-  /* rows */
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-  /* columns */
-  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-  AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
-  /* rounding (add 2^3, divide by 2^4) */
-  SRARI_H4_SH(in0, in1, in2, in3, 4);
-  ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride);
-}
-
-void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride) {
-  int16_t out;
-  v8i16 vec;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO(out, 4);
-  vec = __msa_fill_h(out);
-
-  ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c
deleted file mode 100644
index 981c103cd..000000000
--- a/third_party/aom/aom_dsp/mips/idct8x8_msa.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/inv_txfm_msa.h"
-
-void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-
-  /* load vector elements of 8x8 block */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-
-  /* rows transform */
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  /* 1D idct8x8 */
-  AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                 in4, in5, in6, in7);
-  /* columns transform */
-  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  /* 1D idct8x8 */
-  AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                 in4, in5, in6, in7);
-  /* final rounding (add 2^4, divide by 2^5) and shift */
-  SRARI_H4_SH(in0, in1, in2, in3, 5);
-  SRARI_H4_SH(in4, in5, in6, in7, 5);
-  /* add block and store 8x8 */
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
-
-void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst,
-                            int32_t dst_stride) {
-  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
-  v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3;
-  v4i32 tmp0, tmp1, tmp2, tmp3;
-  v8i16 zero = { 0 };
-
-  /* load vector elements of 8x8 block */
-  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
-  TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
-
-  /* stage1 */
-  ILVL_H2_SH(in3, in0, in2, in1, s0, s1);
-  k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
-  k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
-  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
-  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
-  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
-  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
-  BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5);
-
-  /* stage2 */
-  ILVR_H2_SH(in3, in1, in2, in0, s1, s0);
-  k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
-  k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
-  k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
-  k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
-  DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3);
-  SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS);
-  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1);
-  PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3);
-  BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3);
-
-  /* stage3 */
-  s0 = __msa_ilvr_h(s6, s5);
-
-  k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
-  DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1);
-  SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS);
-  PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3);
-
-  /* stage4 */
-  BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6,
-              in7);
-  TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                     in4, in5, in6, in7);
-  AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                 in4, in5, in6, in7);
-
-  /* final rounding (add 2^4, divide by 2^5) and shift */
-  SRARI_H4_SH(in0, in1, in2, in3, 5);
-  SRARI_H4_SH(in4, in5, in6, in7, 5);
-
-  /* add block and store 8x8 */
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7);
-}
-
-void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst,
-                           int32_t dst_stride) {
-  int16_t out;
-  int32_t val;
-  v8i16 vec;
-
-  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
-  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
-  val = ROUND_POWER_OF_TWO(out, 5);
-  vec = __msa_fill_h(val);
-
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
-  dst += (4 * dst_stride);
-  AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec);
-}
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
index bcb9c9df9..9f25cc1ca 100644
--- a/third_party/aom/aom_dsp/mips/intrapred_msa.c
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
deleted file mode 100644
index c69835173..000000000
--- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
-#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/mips/common_dspr2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* Note: this macro expects a local int32_t named out to exist, and will write
- * to that variable. */
-#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                         \
-  ({                                                                           \
-                                                                               \
-    int32_t tmp;                                                               \
-    int dct_cost_rounding = DCT_CONST_ROUNDING;                                \
-    int in = input;                                                            \
-                                                                               \
-    __asm__ __volatile__(/* out = dct_const_round_shift(dc *  cospi_16_64); */ \
-                         "mtlo     %[dct_cost_rounding],   $ac1              " \
-                         "                \n\t"                                \
-                         "mthi     $zero,                  $ac1              " \
-                         "                \n\t"                                \
-                         "madd     $ac1,                   %[in],            " \
-                         "%[cospi_16_64]  \n\t"                                \
-                         "extp     %[tmp],                 $ac1,             " \
-                         "31              \n\t"                                \
-                                                                               \
-                         /* out = dct_const_round_shift(out * cospi_16_64); */ \
-                         "mtlo     %[dct_cost_rounding],   $ac2              " \
-                         "                \n\t"                                \
-                         "mthi     $zero,                  $ac2              " \
-                         "                \n\t"                                \
-                         "madd     $ac2,                   %[tmp],           " \
-                         "%[cospi_16_64]  \n\t"                                \
-                         "extp     %[out],                 $ac2,             " \
-                         "31              \n\t"                                \
-                                                                               \
-                         : [tmp] "=&r"(tmp), [out] "=r"(out)                   \
-                         : [in] "r"(in),                                       \
-                           [dct_cost_rounding] "r"(dct_cost_rounding),         \
-                           [cospi_16_64] "r"(cospi_16_64));                    \
-    out;                                                                       \
-  })
-
-void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                   int dest_stride);
-void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output);
-void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                     int dest_stride);
-void iadst4_dspr2(const int16_t *input, int16_t *output);
-void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                 int dest_stride);
-void iadst8_dspr2(const int16_t *input, int16_t *output);
-void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows);
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride);
-void iadst16_dspr2(const int16_t *input, int16_t *output);
-
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
deleted file mode 100644
index 122667aa8..000000000
--- a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_
-#define AOM_DSP_MIPS_INV_TXFM_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/mips/txfm_macros_msa.h"
-#include "aom_dsp/txfm_common.h"
-
-#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,  \
-                  out3, out4, out5, out6, out7)                              \
-  {                                                                          \
-    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m;                       \
-    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m;                        \
-    v8i16 coeff0_m = { cospi_2_64,  cospi_6_64,  cospi_10_64, cospi_14_64,   \
-                       cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
-    v8i16 coeff1_m = { cospi_8_64,  -cospi_8_64,  cospi_16_64, -cospi_16_64, \
-                       cospi_24_64, -cospi_24_64, 0,           0 };          \
-                                                                             \
-    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m);                          \
-    cnst2_m = -cnst0_m;                                                      \
-    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
-    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m);                          \
-    cnst4_m = -cnst2_m;                                                      \
-    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
-                                                                             \
-    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m);                                   \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
-                          cnst2_m, cnst3_m, in7, in0, in4, in3);             \
-                                                                             \
-    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m);                          \
-    cnst2_m = -cnst0_m;                                                      \
-    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m);       \
-    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m);                          \
-    cnst4_m = -cnst2_m;                                                      \
-    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m);       \
-                                                                             \
-    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
-                                                                             \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m,  \
-                          cnst2_m, cnst3_m, in5, in2, in6, in1);             \
-    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5);                   \
-    out7 = -s0_m;                                                            \
-    out0 = s1_m;                                                             \
-                                                                             \
-    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m);  \
-                                                                             \
-    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m);       \
-    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
-    cnst1_m = cnst0_m;                                                       \
-                                                                             \
-    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m);                                   \
-    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m,  \
-                          cnst3_m, cnst1_m, out1, out6, s0_m, s1_m);         \
-                                                                             \
-    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                          \
-    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                               \
-                                                                             \
-    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m);                                   \
-    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m);                                 \
-    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);                   \
-    out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);                   \
-    out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);                   \
-    out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);                   \
-                                                                             \
-    out1 = -out1;                                                            \
-    out3 = -out3;                                                            \
-    out5 = -out5;                                                            \
-  }
-
-#define AOM_SET_COSPI_PAIR(c0_h, c1_h)  \
-  ({                                    \
-    v8i16 out0_m, r0_m, r1_m;           \
-                                        \
-    r0_m = __msa_fill_h(c0_h);          \
-    r1_m = __msa_fill_h(c1_h);          \
-    out0_m = __msa_ilvev_h(r1_m, r0_m); \
-                                        \
-    out0_m;                             \
-  })
-
-#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)               \
-  {                                                                            \
-    uint8_t *dst_m = (uint8_t *)(dst);                                         \
-    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                                      \
-    v16i8 tmp0_m, tmp1_m;                                                      \
-    v16i8 zero_m = { 0 };                                                      \
-    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
-                                                                               \
-    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m);                 \
-    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
-               res0_m, res1_m, res2_m, res3_m);                                \
-    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m,   \
-         res2_m, res3_m);                                                      \
-    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m);                            \
-    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m);               \
-    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride);                               \
-  }
-
-#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)             \
-  {                                                                         \
-    v8i16 c0_m, c1_m, c2_m, c3_m;                                           \
-    v8i16 step0_m, step1_m;                                                 \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-                                                                            \
-    c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                    \
-    c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);                   \
-    step0_m = __msa_ilvr_h(in2, in0);                                       \
-    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m);              \
-                                                                            \
-    c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                    \
-    c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                     \
-    step1_m = __msa_ilvr_h(in3, in1);                                       \
-    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m);              \
-    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);            \
-                                                                            \
-    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m);            \
-    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8);                        \
-    BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
-                out0, out1, out2, out3);                                    \
-  }
-
-#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)       \
-  {                                                                    \
-    v8i16 res0_m, res1_m, c0_m, c1_m;                                  \
-    v8i16 k1_m, k2_m, k3_m, k4_m;                                      \
-    v8i16 zero_m = { 0 };                                              \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
-    v4i32 int0_m, int1_m, int2_m, int3_m;                              \
-    v8i16 mask_m = { sinpi_1_9,  sinpi_2_9,  sinpi_3_9,  sinpi_4_9,    \
-                     -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
-                                                                       \
-    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m);          \
-    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m);                   \
-    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                    \
-    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m);           \
-    int0_m = tmp2_m + tmp1_m;                                          \
-                                                                       \
-    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m);                            \
-    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m);                   \
-    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);           \
-    int1_m = tmp0_m + tmp1_m;                                          \
-                                                                       \
-    c0_m = __msa_splati_h(mask_m, 6);                                  \
-    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m);                  \
-    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m);                    \
-    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m);           \
-    int2_m = tmp0_m + tmp1_m;                                          \
-                                                                       \
-    c0_m = __msa_splati_h(mask_m, 6);                                  \
-    c0_m = __msa_ilvev_h(c0_m, k1_m);                                  \
-                                                                       \
-    res0_m = __msa_ilvr_h((in1), (in3));                               \
-    tmp0_m = __msa_dotp_s_w(res0_m, c0_m);                             \
-    int3_m = tmp2_m + tmp0_m;                                          \
-                                                                       \
-    res0_m = __msa_ilvr_h((in2), (in3));                               \
-    c1_m = __msa_ilvev_h(k4_m, k3_m);                                  \
-                                                                       \
-    tmp2_m = __msa_dotp_s_w(res0_m, c1_m);                             \
-    res1_m = __msa_ilvr_h((in0), (in2));                               \
-    c1_m = __msa_ilvev_h(k1_m, zero_m);                                \
-                                                                       \
-    tmp3_m = __msa_dotp_s_w(res1_m, c1_m);                             \
-    int3_m += tmp2_m;                                                  \
-    int3_m += tmp3_m;                                                  \
-                                                                       \
-    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS);       \
-    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1);           \
-    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3);           \
-  }
-
-#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)    \
-  ({                                                  \
-    v8i16 c0_m, c1_m;                                 \
-                                                      \
-    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
-    c0_m = __msa_ilvev_h(c1_m, c0_m);                 \
-                                                      \
-    c0_m;                                             \
-  })
-
-/* multiply and add macro */
-#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,  \
-                 out2, out3)                                                  \
-  {                                                                           \
-    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
-    v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd;                         \
-                                                                              \
-    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m);                            \
-    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m);                            \
-    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
-                cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd);            \
-    SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS);  \
-    PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1);      \
-    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
-                cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd);            \
-    SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS);  \
-    PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3);      \
-  }
-
-/* idct 8x8 macro */
-#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,    \
-                       out2, out3, out4, out5, out6, out7)                    \
-  {                                                                           \
-    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m;             \
-    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m;             \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
-    v8i16 mask_m = { cospi_28_64, cospi_4_64,  cospi_20_64,  cospi_12_64,     \
-                     cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 };  \
-                                                                              \
-    k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5);                                  \
-    k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0);                                  \
-    k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3);                                  \
-    k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2);                                  \
-    AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
-    SUB2(in1, in3, in7, in5, res0_m, res1_m);                                 \
-    k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7);                                  \
-    k1_m = __msa_splati_h(mask_m, 4);                                         \
-                                                                              \
-    ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m);                              \
-    DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m,       \
-                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
-    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
-    tp4_m = in1 + in3;                                                        \
-    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m);                \
-    tp7_m = in7 + in5;                                                        \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                      \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                       \
-    AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
-    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m);              \
-    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
-                out1, out2, out3, out4, out5, out6, out7);                    \
-  }
-
-#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
-                        out2, out3, out4, out5, out6, out7)                   \
-  {                                                                           \
-    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m;                     \
-    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m;                                 \
-    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1;           \
-    v8i16 mask1_m = { cospi_2_64,  cospi_30_64,  -cospi_2_64, cospi_10_64,    \
-                      cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 };  \
-    v8i16 mask2_m = { cospi_14_64,  -cospi_18_64, cospi_26_64, cospi_6_64,    \
-                      -cospi_26_64, cospi_8_64,   cospi_24_64, -cospi_8_64 }; \
-    v8i16 mask3_m = {                                                         \
-      -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0         \
-    };                                                                        \
-                                                                              \
-    k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2);                                 \
-    ILVRL_H2_SH(in1, in0, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
-                r1_m, r2_m, r3_m);                                            \
-    k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1);                                 \
-    ILVRL_H2_SH(in5, in4, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
-                r5_m, r6_m, r7_m);                                            \
-    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m);                      \
-    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m);                          \
-    k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5);                                 \
-    ILVRL_H2_SH(in3, in2, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
-                r1_m, r2_m, r3_m);                                            \
-    k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4);                                 \
-    ILVRL_H2_SH(in7, in6, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m,     \
-                r5_m, r6_m, r7_m);                                            \
-    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m);                      \
-    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m);                          \
-    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m);                                      \
-    BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3);         \
-    k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7);                                 \
-    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0);                                    \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m,     \
-                r1_m, r2_m, r3_m);                                            \
-    k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1);                                 \
-    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m,   \
-                r6_m, r7_m);                                                  \
-    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6);                           \
-    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m,    \
-         m3_m);                                                               \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5);                            \
-    k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2);                                 \
-    k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3);                                 \
-    ILVRL_H2_SH(in4, in3, in_s1, in_s0);                                      \
-    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m,     \
-                m1_m, m2_m, m3_m);                                            \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4);                           \
-    ILVRL_H2_SW(in5, in2, m2_m, m3_m);                                        \
-    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m,   \
-                m2_m, m3_m);                                                  \
-    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS);                      \
-    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5);                           \
-                                                                              \
-    out1 = -in1;                                                              \
-    out3 = -in3;                                                              \
-    out5 = -in5;                                                              \
-    out7 = -in7;                                                              \
-  }
-
-#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11,     \
-                         r12, r13, r14, r15, out0, out1, out2, out3, out4,     \
-                         out5, out6, out7, out8, out9, out10, out11, out12,    \
-                         out13, out14, out15)                                  \
-  {                                                                            \
-    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m;                      \
-    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m;                \
-    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m;                      \
-    v8i16 h8_m, h9_m, h10_m, h11_m;                                            \
-    v8i16 k0_m, k1_m, k2_m, k3_m;                                              \
-                                                                               \
-    /* stage 1 */                                                              \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64);                       \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);                      \
-    MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m);  \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64);                       \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);                      \
-    MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64);                        \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);                       \
-    MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m,       \
-            g11_m);                                                            \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64);                       \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);                      \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64);                        \
-    k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);                       \
-    MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m,      \
-            g15_m);                                                            \
-                                                                               \
-    /* stage 2 */                                                              \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);                       \
-    MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
-            h3_m);                                                             \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);                       \
-    k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);                      \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);                      \
-    MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m,      \
-            h6_m, h7_m);                                                       \
-    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10);             \
-    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
-                h10_m, h11_m, h6_m, h4_m, h2_m, h0_m);                         \
-                                                                               \
-    /* stage 3 */                                                              \
-    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m);           \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);                        \
-    k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);                       \
-    k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);                       \
-    MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5,  \
-            out7);                                                             \
-    MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14,      \
-            out13, out15);                                                     \
-                                                                               \
-    /* stage 4 */                                                              \
-    k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);                       \
-    k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);                     \
-    k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);                      \
-    k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);                      \
-    MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3);                          \
-    MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7);                            \
-    MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11);                        \
-    MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15);                        \
-  }
-
-void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                      int32_t dst_stride);
-void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
-void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
-                                       int32_t dst_stride);
-void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
-#endif  // AOM_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
deleted file mode 100644
index c63b1e857..000000000
--- a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c
+++ /dev/null
@@ -1,1190 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void idct16_rows_dspr2(const int16_t *input, int16_t *output,
-                       uint32_t no_rows) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_10, step1_11, step1_12, step1_13;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-
-  for (i = no_rows; i--;) {
-    /* prefetch row */
-    prefetch_load((const uint8_t *)(input + 16));
-
-    __asm__ __volatile__(
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
-          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
-          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_2_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_30_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
-          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
-          [step2_14] "=r"(step2_14)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac2,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-
-    __asm__ __volatile__(
-        "lh       %[load5],             4(%[input])                     \n\t"
-        "lh       %[load6],             28(%[input])                    \n\t"
-        "lh       %[load7],             20(%[input])                    \n\t"
-        "lh       %[load8],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_4_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_12_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_20_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_12_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_4_64]   \n\t"
-        "madd     $ac2,                 %[load6],       %[cospi_28_64]  \n\t"
-        "extp     %[result4],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
-          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
-          [step1_13] "=r"(step1_13)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
-          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
-          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
-          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
-          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "add      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_12]     \n\t"
-        "add      %[load5],             %[load5],       %[step2_15]     \n\t"
-        "add      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_13]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_14]     \n\t"
-        "sh       %[load5],             0(%[output])                    \n\t"
-        "sh       %[load6],             32(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-        "sub      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-        "add      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "sh       %[load5],             192(%[output])                  \n\t"
-        "sh       %[load6],             224(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_11]     \n\t"
-        "sub      %[load6],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_9]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "sh       %[load5],             256(%[output])                  \n\t"
-        "sh       %[load6],             288(%[output])                  \n\t"
-        "add      %[load5],             %[step1_1],     %[step1_6]      \n\t"
-        "sub      %[load5],             %[load5],       %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_14]     \n\t"
-        "add      %[load6],             %[step1_0],     %[step1_7]      \n\t"
-        "sub      %[load6],             %[load6],       %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_15]     \n\t"
-        "sh       %[load5],             448(%[output])                  \n\t"
-        "sh       %[load6],             480(%[output])                  \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6)
-        : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1),
-          [step1_6] "r"(step1_6), [step1_7] "r"(step1_7),
-          [step2_8] "r"(step2_8), [step2_9] "r"(step2_9),
-          [step2_10] "r"(step2_10), [step2_11] "r"(step2_11),
-          [step2_12] "r"(step2_12), [step2_13] "r"(step2_13),
-          [step2_14] "r"(step2_14), [step2_15] "r"(step2_15));
-
-    __asm__ __volatile__(
-        "add      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_13]     \n\t"
-        "add      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_12]     \n\t"
-        "sh       %[load5],             64(%[output])                   \n\t"
-        "sh       %[load6],             96(%[output])                   \n\t"
-        "sub      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "add      %[load5],             %[load5],       %[step1_11]     \n\t"
-        "sub      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "add      %[load6],             %[load6],       %[step1_10]     \n\t"
-        "sh       %[load5],             128(%[output])                  \n\t"
-        "sh       %[load6],             160(%[output])                  \n\t"
-        "sub      %[load5],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_10]     \n\t"
-        "sub      %[load6],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_11]     \n\t"
-        "sh       %[load5],             320(%[output])                  \n\t"
-        "sh       %[load6],             352(%[output])                  \n\t"
-        "add      %[load5],             %[step1_3],     %[step1_4]      \n\t"
-        "sub      %[load5],             %[load5],       %[step1_12]     \n\t"
-        "add      %[load6],             %[step1_2],     %[step1_5]      \n\t"
-        "sub      %[load6],             %[load6],       %[step1_13]     \n\t"
-        "sh       %[load5],             384(%[output])                  \n\t"
-        "sh       %[load6],             416(%[output])                  \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6)
-        : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
-          [step1_4] "r"(step1_4), [step1_5] "r"(step1_5),
-          [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
-          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13));
-
-    input += 16;
-    output += 1;
-  }
-}
-/*
- * Column pass of the 16x16 inverse DCT that adds its result into `dest`
- * (MIPS DSPr2 inline assembly).
- *
- * Runs 16 iterations. Iteration i reads 16 int16_t values from `input`
- * (byte offsets 0..30; `input` then advances by 16 elements), combines them
- * into 16 output terms and accumulates those into the 16 bytes
- * dest[i], dest[i + dest_stride], ..., dest[i + 15 * dest_stride]. Each term
- * is rounded as (x + 32) >> 6, added to the existing byte, and the sum is
- * used as an index into aom_ff_cropTbl (`cm`) to fetch the byte stored back.
- *
- * input:       256 int16_t values; only read by this function.
- * dest:        destination pixels, updated in place.
- * dest_stride: byte distance between consecutive rows of `dest`.
- *
- * NOTE(review): every `extp ..., 31` depends on the extract position held in
- * the DSP control register. The callers visible nearby
- * (aom_idct16x16_256_add_dspr2, aom_idct16x16_10_add_dspr2) program it with
- * `wrdsp` (pos = 45) before calling; any other caller presumably must do the
- * same -- confirm against the MIPS DSP ASE manual.
- * NOTE(review): aom_ff_cropTbl is presumably a clamp-to-[0, 255] table that
- * tolerates out-of-range indices; its definition is not visible here.
- */
-void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) {
-  int i;
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int step1_8, step1_9, step1_10, step1_11;
-  int step1_12, step1_13, step1_14, step1_15;
-  int step2_0, step2_1, step2_2, step2_3;
-  int step2_8, step2_9, step2_10, step2_11;
-  int step2_12, step2_13, step2_14, step2_15;
-  int load1, load2, load3, load4, load5, load6, load7, load8;
-  int result1, result2, result3, result4;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl: eight prefetches, 32 elements apart (0..224) */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-  /*
-   * One column of `dest` per iteration. The first four asm blocks each load
-   * four coefficients; this first one takes byte offsets 0, 16, 8 and 24 and
-   * produces step1_0..step1_3. Before every multiply-accumulate sequence the
-   * accumulator is seeded with const_2_power_13 (lo = 8192, hi = 0),
-   * presumably as the rounding term for the following extp.
-   */
-  for (i = 0; i < 16; ++i) {
-    dest_pix = (dest + i);
-    __asm__ __volatile__(
-        "lh       %[load1],              0(%[input])                    \n\t"
-        "lh       %[load2],             16(%[input])                    \n\t"
-        "lh       %[load3],              8(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[step2_0],           $ac1,           31              \n\t"
-        "extp     %[step2_1],           $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[step2_3],           $ac1,           31              \n\t"
-
-        "add      %[step1_0],           %[step2_0],     %[step2_3]      \n\t"
-        "add      %[step1_1],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_2],           %[step2_1],     %[step2_2]      \n\t"
-        "sub      %[step1_3],           %[step2_0],     %[step2_3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [step2_0] "=&r"(step2_0),
-          [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2),
-          [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_16_64] "r"(cospi_16_64));
-    /* Offsets 2, 30, 18, 14 -> step2_8, step2_9, step2_14, step2_15. */
-    __asm__ __volatile__(
-        "lh       %[load5],             2(%[input])                     \n\t"
-        "lh       %[load6],             30(%[input])                    \n\t"
-        "lh       %[load7],             18(%[input])                    \n\t"
-        "lh       %[load8],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load6],       %[cospi_2_64]   \n\t"
-        "extp     %[result1],           $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load7],       %[cospi_14_64]  \n\t"
-        "msub     $ac3,                 %[load8],       %[cospi_18_64]  \n\t"
-        "extp     %[result2],           $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load7],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load8],       %[cospi_14_64]  \n\t"
-        "extp     %[result3],           $ac1,           31              \n\t"
-
-        "madd     $ac2,                 %[load5],        %[cospi_2_64]  \n\t"
-        "madd     $ac2,                 %[load6],        %[cospi_30_64] \n\t"
-        "extp     %[result4],           $ac2,            31             \n\t"
-
-        "sub      %[load5],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[result4],     %[result3]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load5],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load5],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[result1],     %[result2]      \n\t"
-        "add      %[step2_15],          %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_8] "=r"(step2_8),
-          [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9),
-          [step2_14] "=r"(step2_14)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-    /* Offsets 10, 22, 26, 6 -> step2_10, step2_11, step2_12, step2_13. */
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             22(%[input])                    \n\t"
-        "lh       %[load3],             26(%[input])                    \n\t"
-        "lh       %[load4],             6(%[input])                     \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_22_64]     \n\t"
-        "msub     $ac1,                 %[load2],    %[cospi_10_64]     \n\t"
-        "extp     %[result1],           $ac1,        31                 \n\t"
-
-        "madd     $ac3,                 %[load3],    %[cospi_6_64]      \n\t"
-        "msub     $ac3,                 %[load4],    %[cospi_26_64]     \n\t"
-        "extp     %[result2],           $ac3,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac1,                 %[load1],    %[cospi_10_64]     \n\t"
-        "madd     $ac1,                 %[load2],    %[cospi_22_64]     \n\t"
-        "extp     %[result3],           $ac1,        31                 \n\t"
-
-        "madd     $ac2,                 %[load3],    %[cospi_26_64]     \n\t"
-        "madd     $ac2,                 %[load4],    %[cospi_6_64]      \n\t"
-        "extp     %[result4],           $ac2,        31                 \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[result2],     %[result1]      \n\t"
-        "sub      %[load2],             %[result4],     %[result3]      \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[result1],     %[result2]      \n\t"
-        "add      %[step2_12],          %[result4],     %[result3]      \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64));
-    /* Offsets 4, 28, 20, 12 -> step1_4, step1_5, step1_6, step1_7. */
-    __asm__ __volatile__(
-        "lh       %[load5],             4(%[input])                   \n\t"
-        "lh       %[load6],             28(%[input])                  \n\t"
-        "lh       %[load7],             20(%[input])                  \n\t"
-        "lh       %[load8],             12(%[input])                  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                          \n\t"
-        "mthi     $zero,                $ac3                          \n\t"
-
-        "madd     $ac1,                 %[load5],    %[cospi_28_64]   \n\t"
-        "msub     $ac1,                 %[load6],    %[cospi_4_64]    \n\t"
-        "extp     %[result1],           $ac1,        31               \n\t"
-
-        "madd     $ac3,                 %[load7],    %[cospi_12_64]   \n\t"
-        "msub     $ac3,                 %[load8],    %[cospi_20_64]   \n\t"
-        "extp     %[result2],           $ac3,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                          \n\t"
-        "mthi     $zero,                $ac1                          \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                          \n\t"
-        "mthi     $zero,                $ac2                          \n\t"
-
-        "madd     $ac1,                 %[load7],    %[cospi_20_64]   \n\t"
-        "madd     $ac1,                 %[load8],    %[cospi_12_64]   \n\t"
-        "extp     %[result3],           $ac1,        31               \n\t"
-
-        "madd     $ac2,                 %[load5],    %[cospi_4_64]    \n\t"
-        "madd     $ac2,                 %[load6],    %[cospi_28_64]   \n\t"
-        "extp     %[result4],           $ac2,        31               \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[result4],     %[result3]      \n\t"
-        "sub      %[load5],             %[load5],       %[result1]      \n\t"
-        "add      %[load5],             %[load5],       %[result2]      \n\t"
-
-        "sub      %[load6],             %[result1],     %[result2]      \n\t"
-        "sub      %[load6],             %[load6],       %[result3]      \n\t"
-        "add      %[load6],             %[load6],       %[result4]      \n\t"
-
-        "madd     $ac1,                 %[load5],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-
-        "add      %[step1_4],           %[result1],     %[result2]      \n\t"
-        "add      %[step1_7],           %[result4],     %[result3]      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [result3] "=&r"(result3),
-          [result4] "=&r"(result4), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-    /* No loads: derives step1_10..step1_13 from step2_8..step2_15. */
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "sub      %[load5],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_9]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_10]     \n\t"
-
-        "madd     $ac0,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_10]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_9]      \n\t"
-
-        "madd     $ac1,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load5],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load5],             %[load5],       %[step2_8]      \n\t"
-        "add      %[load5],             %[load5],       %[step2_11]     \n\t"
-
-        "madd     $ac2,                 %[load5],       %[cospi_16_64]  \n\t"
-
-        "sub      %[load6],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[load6],             %[load6],       %[step2_11]     \n\t"
-        "add      %[load6],             %[load6],       %[step2_8]      \n\t"
-
-        "madd     $ac3,                 %[load6],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_10],          $ac0,           31              \n\t"
-        "extp     %[step1_13],          $ac1,           31              \n\t"
-        "extp     %[step1_11],          $ac2,           31              \n\t"
-        "extp     %[step1_12],          $ac3,           31              \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10),
-          [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12),
-          [step1_13] "=r"(step1_13)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14),
-          [step2_13] "r"(step2_13), [step2_9] "r"(step2_9),
-          [step2_10] "r"(step2_10), [step2_15] "r"(step2_15),
-          [step2_12] "r"(step2_12), [step2_8] "r"(step2_8),
-          [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64));
-    /* The remaining four terms are plain sums, computed in C. */
-    step1_8 = step2_8 + step2_11;
-    step1_9 = step2_9 + step2_10;
-    step1_14 = step2_13 + step2_14;
-    step1_15 = step2_12 + step2_15;
-    /*
-     * Output stage: two rows per group, starting with rows 0 and 1. For each
-     * row: load the dest byte (lbu), form the output term from step1_*, round
-     * it with +32 and >>6, add it to the byte, look the sum up in cm (lbux)
-     * and store the result (sb); dest_pix then advances by dest_stride.
-     */
-    __asm__ __volatile__(
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_15]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_14]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 2 and 3 */
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_13]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_12]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 4 and 5 */
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "add      %[load5],         %[load5],           %[step1_11]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_10]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 6 and 7 */
-        "sub      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[load5],           %[step1_9]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "add      %[load6],         %[load6],           %[step1_8]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 8 and 9 */
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_8]      \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_9]      \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 10 and 11 */
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "sub      %[load5],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_10]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "sub      %[load6],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_11]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 12 and 13 */
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_3],         %[step1_4]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_12]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_2],         %[step1_5]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_13]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        /* rows 14 and 15 (no pointer advance after the last store) */
-        "lbu      %[load7],         0(%[dest_pix])                      \n\t"
-        "add      %[load5],         %[step1_1],         %[step1_6]      \n\t"
-        "sub      %[load5],         %[load5],           %[step1_14]     \n\t"
-        "addi     %[load5],         %[load5],           32              \n\t"
-        "sra      %[load5],         %[load5],           6               \n\t"
-        "add      %[load7],         %[load7],           %[load5]        \n\t"
-        "lbux     %[load5],         %[load7](%[cm])                     \n\t"
-        "add      %[load6],         %[step1_0],         %[step1_7]      \n\t"
-        "sub      %[load6],         %[load6],           %[step1_15]     \n\t"
-        "sb       %[load5],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[load8],         0(%[dest_pix])                      \n\t"
-        "addi     %[load6],         %[load6],           32              \n\t"
-        "sra      %[load6],         %[load6],           6               \n\t"
-        "add      %[load8],         %[load8],           %[load6]        \n\t"
-        "lbux     %[load6],         %[load8](%[cm])                     \n\t"
-        "sb       %[load6],         0(%[dest_pix])                      \n\t"
-
-        : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7),
-          [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix)
-        :
-        [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
-        [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3),
-        [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
-        [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9),
-        [step1_10] "r"(step1_10), [step1_11] "r"(step1_11),
-        [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
-        [step1_14] "r"(step1_14), [step1_15] "r"(step1_15));
-    /* Move on to the next group of 16 coefficients. */
-    input += 16;
-  }
-}
-/*
- * Full 16x16 inverse DCT with add: runs idct16_rows_dspr2 on `input` with a
- * row count of 16, writing into the local 16 * 16 int16_t buffer `out`
- * (declared via DECLARE_ALIGNED(32, ...)), then calls
- * idct16_cols_add_blk_dspr2 to transform `out` and add the result to `dest`.
- *
- * input:       coefficient block handed to the row pass.
- * dest:        destination pixels, updated by the column pass.
- * dest_stride: stride of `dest`, forwarded unchanged to the column pass.
- */
-void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc; written to DSP control by wrdsp */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows
-  idct16_rows_dspr2(input, out, 16);
-
-  // Then transform columns and add to dest
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-/*
- * 16x16 inverse DCT with add for blocks whose non-zero coefficients all lie
- * in the upper-left 4x4 area (see the comment in the body). Only 4 rows are
- * passed through idct16_rows_dspr2; the rest of the scratch buffer `out` is
- * zero-filled by hand before idct16_cols_add_blk_dspr2 adds the result to
- * `dest`.
- *
- * input:       coefficient block handed to the row pass.
- * dest:        destination pixels, updated by the column pass.
- * dest_stride: stride of `dest`, forwarded unchanged to the column pass.
- */
-void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
-                                int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
-  int16_t *outptr = out;
-  uint32_t i;
-  uint32_t pos = 45;
-
-  /* bit position for extract from acc; written to DSP control by wrdsp */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  idct16_rows_dspr2(input, outptr, 4);
-  /*
-   * Zero int16_t columns 4..15 of all 16 rows of `out` (row pitch 32 bytes):
-   * each iteration clears one 4-byte word, i.e. two columns, in every row,
-   * and six iterations cover the remaining twelve columns.
-   * NOTE(review): columns 0..3 are presumably the ones written by the 4-row
-   * call above -- confirm against idct16_rows_dspr2.
-   * NOTE(review): the asm below has no output operand and no "memory"
-   * clobber, so the compiler is not told that `out` is modified.
-   */
-  outptr += 4;
-  for (i = 0; i < 6; ++i) {
-    __asm__ __volatile__(
-        "sw     $zero,    0(%[outptr])     \n\t"
-        "sw     $zero,   32(%[outptr])     \n\t"
-        "sw     $zero,   64(%[outptr])     \n\t"
-        "sw     $zero,   96(%[outptr])     \n\t"
-        "sw     $zero,  128(%[outptr])     \n\t"
-        "sw     $zero,  160(%[outptr])     \n\t"
-        "sw     $zero,  192(%[outptr])     \n\t"
-        "sw     $zero,  224(%[outptr])     \n\t"
-        "sw     $zero,  256(%[outptr])     \n\t"
-        "sw     $zero,  288(%[outptr])     \n\t"
-        "sw     $zero,  320(%[outptr])     \n\t"
-        "sw     $zero,  352(%[outptr])     \n\t"
-        "sw     $zero,  384(%[outptr])     \n\t"
-        "sw     $zero,  416(%[outptr])     \n\t"
-        "sw     $zero,  448(%[outptr])     \n\t"
-        "sw     $zero,  480(%[outptr])     \n\t"
-
-        :
-        : [outptr] "r"(outptr));
-
-    outptr += 2;
-  }
-
-  // Then transform columns
-  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                               int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t vector_a1;
-  int32_t t1, t2, t3, t4;
-  int32_t vector_1, vector_2, vector_3, vector_4;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__(
-      "addi     %[out],     %[out],     32      \n\t"
-      "sra      %[a1],      %[out],     6       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 16; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 16; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-
-void iadst16_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = output[8] = output[9] = output[10] =
-            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8 = dct_const_round_shift(s0 - s8);
-  x9 = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
-
-  output[0] = x0;
-  output[1] = -x8;
-  output[2] = x12;
-  output[3] = -x4;
-  output[4] = x6;
-  output[5] = x14;
-  output[6] = x10;
-  output[7] = x2;
-  output[8] = x3;
-  output[9] = x11;
-  output[10] = x15;
-  output[11] = x7;
-  output[12] = x5;
-  output[13] = -x13;
-  output[14] = x9;
-  output[15] = -x1;
-}
-
-#endif  // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
deleted file mode 100644
index d469d1ad0..000000000
--- a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c
+++ /dev/null
@@ -1,1042 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                   int dest_stride) {
-  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
-  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
-  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
-  int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
-  int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
-  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
-  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
-  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
-  int16_t step2_28, step2_29, step2_30, step2_31;
-  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
-  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
-  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
-  int16_t step3_28, step3_29, step3_30, step3_31;
-  int temp0, temp1, temp2, temp3;
-  int load1, load2, load3, load4;
-  int result1, result2;
-  int i, temp21;
-  uint8_t *dest_pix, *dest_pix1;
-  const int const_2_power_13 = 8192;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 32; ++i) {
-    dest_pix = dest + i;
-    dest_pix1 = dest + i + 31 * dest_stride;
-
-    __asm__ __volatile__(
-        "lh       %[load1],             2(%[input])                     \n\t"
-        "lh       %[load2],             62(%[input])                    \n\t"
-        "lh       %[load3],             34(%[input])                    \n\t"
-        "lh       %[load4],             30(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_17],          $ac1,           31              \n\t"
-        "extp     %[step1_30],          $ac3,           31              \n\t"
-        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
-          [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
-          [step1_31] "=r"(step1_31)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
-          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             18(%[input])                    \n\t"
-        "lh       %[load2],             46(%[input])                    \n\t"
-        "lh       %[load3],             50(%[input])                    \n\t"
-        "lh       %[load4],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_18],          $ac1,           31              \n\t"
-        "extp     %[step1_29],          $ac3,           31              \n\t"
-        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
-          [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
-          [step1_29] "=r"(step1_29)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
-          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             54(%[input])                    \n\t"
-        "lh       %[load3],             42(%[input])                    \n\t"
-        "lh       %[load4],             22(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-
-        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
-
-        "extp     %[step1_21],          $ac1,           31              \n\t"
-        "extp     %[step1_26],          $ac3,           31              \n\t"
-        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
-          [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
-          [step1_27] "=r"(step1_27)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
-          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             26(%[input])                    \n\t"
-        "lh       %[load2],             38(%[input])                    \n\t"
-        "lh       %[load3],             58(%[input])                    \n\t"
-        "lh       %[load4],              6(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_22],          $ac1,           31              \n\t"
-        "extp     %[step1_25],          $ac3,           31              \n\t"
-        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
-          [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
-          [step1_25] "=r"(step1_25)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
-          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],              4(%[input])                    \n\t"
-        "lh       %[load2],             60(%[input])                    \n\t"
-        "lh       %[load3],             36(%[input])                    \n\t"
-        "lh       %[load4],             28(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
-          [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
-          [step2_15] "=r"(step2_15)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             20(%[input])                    \n\t"
-        "lh       %[load2],             44(%[input])                    \n\t"
-        "lh       %[load3],             52(%[input])                    \n\t"
-        "lh       %[load4],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
-        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
-        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
-        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
-
-        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
-        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
-        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
-        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
-        "extp     %[step3_10],          $ac0,           31              \n\t"
-        "extp     %[step3_13],          $ac1,           31              \n\t"
-        "extp     %[step3_11],          $ac2,           31              \n\t"
-        "extp     %[step3_12],          $ac3,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
-          [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
-          [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
-          [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
-          [step3_15] "=r"(step3_15)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
-          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
-          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
-          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
-          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
-
-    step2_18 = step1_17 - step1_18;
-    step2_29 = step1_30 - step1_29;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_18],          $ac0,           31              \n\t"
-
-        : [step3_18] "=r"(step3_18)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
-          [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
-    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_19 = step1_16 - step1_19;
-    step2_28 = step1_31 - step1_28;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_19],          $ac0,           31              \n\t"
-
-        : [step3_19] "=r"(step3_19)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
-          [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
-    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_16 = step1_16 + step1_19;
-    step3_17 = step1_17 + step1_18;
-    step3_30 = step1_29 + step1_30;
-    step3_31 = step1_28 + step1_31;
-
-    step2_20 = step1_23 - step1_20;
-    step2_27 = step1_24 - step1_27;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_20],          $ac0,           31              \n\t"
-
-        : [step3_20] "=r"(step3_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
-    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_21 = step1_22 - step1_21;
-    step2_26 = step1_25 - step1_26;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_21],          $ac1,           31              \n\t"
-
-        : [step3_21] "=r"(step3_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
-          [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
-    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_22 = step1_21 + step1_22;
-    step3_23 = step1_20 + step1_23;
-    step3_24 = step1_24 + step1_27;
-    step3_25 = step1_25 + step1_26;
-
-    step2_16 = step3_16 + step3_23;
-    step2_17 = step3_17 + step3_22;
-    step2_18 = step3_18 + step3_21;
-    step2_19 = step3_19 + step3_20;
-    step2_20 = step3_19 - step3_20;
-    step2_21 = step3_18 - step3_21;
-    step2_22 = step3_17 - step3_22;
-    step2_23 = step3_16 - step3_23;
-
-    step2_24 = step3_31 - step3_24;
-    step2_25 = step3_30 - step3_25;
-    step2_26 = step3_29 - step3_26;
-    step2_27 = step3_28 - step3_27;
-    step2_28 = step3_28 + step3_27;
-    step2_29 = step3_29 + step3_26;
-    step2_30 = step3_30 + step3_25;
-    step2_31 = step3_31 + step3_24;
-
-    __asm__ __volatile__(
-        "lh       %[load1],             0(%[input])                     \n\t"
-        "lh       %[load2],             32(%[input])                    \n\t"
-        "lh       %[load3],             16(%[input])                    \n\t"
-        "lh       %[load4],             48(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[temp2],             $ac3,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[temp3],             $ac1,           31              \n\t"
-        "add      %[step1_0],           %[temp0],       %[temp3]        \n\t"
-        "add      %[step1_1],           %[temp1],       %[temp2]        \n\t"
-        "sub      %[step1_2],           %[temp1],       %[temp2]        \n\t"
-        "sub      %[step1_3],           %[temp0],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             8(%[input])                     \n\t"
-        "lh       %[load2],             56(%[input])                    \n\t"
-        "lh       %[load3],             40(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
-        "add      %[load1],             %[load1],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
-        "add      %[load2],             %[load2],       %[temp3]        \n\t"
-        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    step2_0 = step1_0 + step1_7;
-    step2_1 = step1_1 + step1_6;
-    step2_2 = step1_2 + step1_5;
-    step2_3 = step1_3 + step1_4;
-    step2_4 = step1_3 - step1_4;
-    step2_5 = step1_2 - step1_5;
-    step2_6 = step1_1 - step1_6;
-    step2_7 = step1_0 - step1_7;
-
-    // stage 7
-    step1_0 = step2_0 + step3_15;
-    step1_1 = step2_1 + step3_14;
-    step1_2 = step2_2 + step3_13;
-    step1_3 = step2_3 + step3_12;
-    step1_4 = step2_4 + step3_11;
-    step1_5 = step2_5 + step3_10;
-    step1_6 = step2_6 + step3_9;
-    step1_7 = step2_7 + step3_8;
-    step1_8 = step2_7 - step3_8;
-    step1_9 = step2_6 - step3_9;
-    step1_10 = step2_5 - step3_10;
-    step1_11 = step2_4 - step3_11;
-    step1_12 = step2_3 - step3_12;
-    step1_13 = step2_2 - step3_13;
-    step1_14 = step2_1 - step3_14;
-    step1_15 = step2_0 - step3_15;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_20],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_20 + step2_27) * cospi_16_64;
-    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_21],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
-          [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_21 + step2_26) * cospi_16_64;
-    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_22],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
-          [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_22 + step2_25) * cospi_16_64;
-    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_23],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
-          [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_23 + step2_24) * cospi_16_64;
-    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_0],         %[step2_31]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_1],         %[step2_30]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_2],         %[step2_29]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_3],         %[step2_28]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0),
-          [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
-          [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
-          [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
-          [step2_31] "r"(step2_31));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_4],         %[step1_27]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_5],         %[step1_26]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_6],         %[step1_25]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_7],         %[step1_24]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4),
-          [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
-          [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
-          [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
-          [step1_27] "r"(step1_27));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_8],         %[step1_23]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_9],         %[step1_22]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_10],        %[step1_21]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_11],        %[step1_20]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8),
-          [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
-          [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
-          [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
-          [step1_23] "r"(step1_23));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_12],        %[step2_19]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_13],        %[step2_18]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix])                      \n\t"
-        "add      %[temp0],         %[step1_14],        %[step2_17]     \n\t"
-        "addi     %[temp0],         %[temp0],           32              \n\t"
-        "sra      %[temp0],         %[temp0],           6               \n\t"
-        "add      %[temp2],         %[temp2],           %[temp0]        \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "add      %[temp1],         %[step1_15],        %[step2_16]     \n\t"
-        "sb       %[temp0],         0(%[dest_pix])                      \n\t"
-        "addu     %[dest_pix],      %[dest_pix],        %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix])                      \n\t"
-        "addi     %[temp1],         %[temp1],           32              \n\t"
-        "sra      %[temp1],         %[temp1],           6               \n\t"
-        "add      %[temp3],         %[temp3],           %[temp1]        \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix])                      \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step1_12] "r"(step1_12), [step1_13] "r"(step1_13),
-          [step1_14] "r"(step1_14), [step1_15] "r"(step1_15),
-          [step2_16] "r"(step2_16), [step2_17] "r"(step2_17),
-          [step2_18] "r"(step2_18), [step2_19] "r"(step2_19));
-
-    step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
-    step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
-    step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
-    step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
-
-    __asm__ __volatile__(
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_15]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_14]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-
-        "lbu      %[temp2],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp2],         %[temp2],           %[step3_13]     \n\t"
-        "lbux     %[temp0],         %[temp2](%[cm])                     \n\t"
-        "sb       %[temp0],         0(%[dest_pix1])                     \n\t"
-        "subu     %[dest_pix1],     %[dest_pix1],       %[dest_stride]  \n\t"
-        "lbu      %[temp3],         0(%[dest_pix1])                     \n\t"
-        "add      %[temp3],         %[temp3],           %[step3_12]     \n\t"
-        "lbux     %[temp1],         %[temp3](%[cm])                     \n\t"
-        "sb       %[temp1],         0(%[dest_pix1])                     \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
-          [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
-        : [cm] "r"(cm), [dest_stride] "r"(dest_stride),
-          [step3_12] "r"(step3_12), [step3_13] "r"(step3_13),
-          [step3_14] "r"(step3_14), [step3_15] "r"(step3_15));
-
-    input += 32;
-  }
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
deleted file mode 100644
index fa7703217..000000000
--- a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c
+++ /dev/null
@@ -1,1030 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
-                              uint32_t no_rows) {
-  int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
-  int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
-  int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
-  int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
-  int16_t step1_28, step1_29, step1_30, step1_31;
-  int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-  int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
-  int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
-  int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
-  int16_t step2_28, step2_29, step2_30, step2_31;
-  int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
-  int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
-  int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
-  int16_t step3_29, step3_30, step3_31;
-  int temp0, temp1, temp2, temp3;
-  int load1, load2, load3, load4;
-  int result1, result2;
-  int temp21;
-  int i;
-  const int const_2_power_13 = 8192;
-  const int32_t *input_int;
-
-  for (i = no_rows; i--;) {
-    input_int = (const int32_t *)input;
-
-    if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
-          input_int[4] | input_int[5] | input_int[6] | input_int[7] |
-          input_int[8] | input_int[9] | input_int[10] | input_int[11] |
-          input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
-      input += 32;
-
-      __asm__ __volatile__(
-          "sh     $zero,     0(%[output])     \n\t"
-          "sh     $zero,    64(%[output])     \n\t"
-          "sh     $zero,   128(%[output])     \n\t"
-          "sh     $zero,   192(%[output])     \n\t"
-          "sh     $zero,   256(%[output])     \n\t"
-          "sh     $zero,   320(%[output])     \n\t"
-          "sh     $zero,   384(%[output])     \n\t"
-          "sh     $zero,   448(%[output])     \n\t"
-          "sh     $zero,   512(%[output])     \n\t"
-          "sh     $zero,   576(%[output])     \n\t"
-          "sh     $zero,   640(%[output])     \n\t"
-          "sh     $zero,   704(%[output])     \n\t"
-          "sh     $zero,   768(%[output])     \n\t"
-          "sh     $zero,   832(%[output])     \n\t"
-          "sh     $zero,   896(%[output])     \n\t"
-          "sh     $zero,   960(%[output])     \n\t"
-          "sh     $zero,  1024(%[output])     \n\t"
-          "sh     $zero,  1088(%[output])     \n\t"
-          "sh     $zero,  1152(%[output])     \n\t"
-          "sh     $zero,  1216(%[output])     \n\t"
-          "sh     $zero,  1280(%[output])     \n\t"
-          "sh     $zero,  1344(%[output])     \n\t"
-          "sh     $zero,  1408(%[output])     \n\t"
-          "sh     $zero,  1472(%[output])     \n\t"
-          "sh     $zero,  1536(%[output])     \n\t"
-          "sh     $zero,  1600(%[output])     \n\t"
-          "sh     $zero,  1664(%[output])     \n\t"
-          "sh     $zero,  1728(%[output])     \n\t"
-          "sh     $zero,  1792(%[output])     \n\t"
-          "sh     $zero,  1856(%[output])     \n\t"
-          "sh     $zero,  1920(%[output])     \n\t"
-          "sh     $zero,  1984(%[output])     \n\t"
-
-          :
-          : [output] "r"(output));
-
-      output += 1;
-
-      continue;
-    }
-
-    /* prefetch row */
-    prefetch_load((const uint8_t *)(input + 32));
-    prefetch_load((const uint8_t *)(input + 48));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             2(%[input])                     \n\t"
-        "lh       %[load2],             62(%[input])                    \n\t"
-        "lh       %[load3],             34(%[input])                    \n\t"
-        "lh       %[load4],             30(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_31_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_1_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_1_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_31_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_15_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_17_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_17_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_15_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_17],          $ac1,           31              \n\t"
-        "extp     %[step1_30],          $ac3,           31              \n\t"
-        "add      %[step1_16],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_31],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16),
-          [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30),
-          [step1_31] "=r"(step1_31)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
-          [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             18(%[input])                    \n\t"
-        "lh       %[load2],             46(%[input])                    \n\t"
-        "lh       %[load3],             50(%[input])                    \n\t"
-        "lh       %[load4],             14(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_23_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_9_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_9_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_23_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_7_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_25_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_25_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_7_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-
-        "extp     %[step1_18],          $ac1,           31              \n\t"
-        "extp     %[step1_29],          $ac3,           31              \n\t"
-        "add      %[step1_19],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_28],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18),
-          [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28),
-          [step1_29] "=r"(step1_29)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
-          [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             10(%[input])                    \n\t"
-        "lh       %[load2],             54(%[input])                    \n\t"
-        "lh       %[load3],             42(%[input])                    \n\t"
-        "lh       %[load4],             22(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_27_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_5_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_5_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_27_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_11_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_21_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_21_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_11_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-
-        "madd     $ac1,                 %[load2],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_12_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_20_64]  \n\t"
-
-        "extp     %[step1_21],          $ac1,           31              \n\t"
-        "extp     %[step1_26],          $ac3,           31              \n\t"
-        "add      %[step1_20],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_27],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20),
-          [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26),
-          [step1_27] "=r"(step1_27)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
-          [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             26(%[input])                    \n\t"
-        "lh       %[load2],             38(%[input])                    \n\t"
-        "lh       %[load3],             58(%[input])                    \n\t"
-        "lh       %[load4],              6(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_19_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_13_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_13_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_19_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_3_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_29_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_29_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_3_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_12_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_20_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_20_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_12_64]  \n\t"
-
-        "extp     %[step1_22],          $ac1,           31              \n\t"
-        "extp     %[step1_25],          $ac3,           31              \n\t"
-        "add      %[step1_23],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_24],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22),
-          [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24),
-          [step1_25] "=r"(step1_25)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
-          [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
-          [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],              4(%[input])                    \n\t"
-        "lh       %[load2],             60(%[input])                    \n\t"
-        "lh       %[load3],             36(%[input])                    \n\t"
-        "lh       %[load4],             28(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_30_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_2_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_2_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_30_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_14_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_18_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_18_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_14_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[temp3],       %[temp2]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load2],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load1],       %[cospi_24_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_9],           $ac1,           31              \n\t"
-        "extp     %[step2_14],          $ac3,           31              \n\t"
-        "add      %[step2_8],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_15],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8),
-          [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14),
-          [step2_15] "=r"(step2_15)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
-          [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "lh       %[load1],             20(%[input])                    \n\t"
-        "lh       %[load2],             44(%[input])                    \n\t"
-        "lh       %[load3],             52(%[input])                    \n\t"
-        "lh       %[load4],             12(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_22_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_10_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_10_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_22_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_6_64]   \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_26_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_26_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_6_64]   \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp1],       %[temp0]        \n\t"
-        "sub      %[load2],             %[temp2],       %[temp3]        \n\t"
-
-        "msub     $ac1,                 %[load1],       %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_8_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load1],       %[cospi_8_64]   \n\t"
-
-        "extp     %[step2_10],          $ac1,           31              \n\t"
-        "extp     %[step2_13],          $ac3,           31              \n\t"
-        "add      %[step2_11],          %[temp0],       %[temp1]        \n\t"
-        "add      %[step2_12],          %[temp2],       %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10),
-          [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12),
-          [step2_13] "=r"(step2_13)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
-          [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "sub      %[temp0],             %[step2_14],    %[step2_13]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_9]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_10]     \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "sub      %[temp1],             %[step2_14],    %[step2_13]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_9]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_10]     \n\t"
-        "madd     $ac1,                 %[temp1],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "sub      %[temp0],             %[step2_15],    %[step2_12]     \n\t"
-        "sub      %[temp0],             %[temp0],       %[step2_8]      \n\t"
-        "add      %[temp0],             %[temp0],       %[step2_11]     \n\t"
-        "madd     $ac2,                 %[temp0],       %[cospi_16_64]  \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "sub      %[temp1],             %[step2_15],    %[step2_12]     \n\t"
-        "add      %[temp1],             %[temp1],       %[step2_8]      \n\t"
-        "sub      %[temp1],             %[temp1],       %[step2_11]     \n\t"
-        "madd     $ac3,                 %[temp1],       %[cospi_16_64]  \n\t"
-
-        "add      %[step3_8],           %[step2_8],     %[step2_11]     \n\t"
-        "add      %[step3_9],           %[step2_9],     %[step2_10]     \n\t"
-        "add      %[step3_14],          %[step2_13],    %[step2_14]     \n\t"
-        "add      %[step3_15],          %[step2_12],    %[step2_15]     \n\t"
-
-        "extp     %[step3_10],          $ac0,           31              \n\t"
-        "extp     %[step3_13],          $ac1,           31              \n\t"
-        "extp     %[step3_11],          $ac2,           31              \n\t"
-        "extp     %[step3_12],          $ac3,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8),
-          [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10),
-          [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12),
-          [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14),
-          [step3_15] "=r"(step3_15)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
-          [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
-          [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
-          [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
-          [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
-
-    step2_18 = step1_17 - step1_18;
-    step2_29 = step1_30 - step1_29;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_18],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_29],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_18],          $ac0,           31              \n\t"
-
-        : [step3_18] "=r"(step3_18)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18),
-          [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
-    step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_19 = step1_16 - step1_19;
-    step2_28 = step1_31 - step1_28;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_19],    %[cospi_8_64]   \n\t"
-        "madd     $ac0,                 %[step2_28],    %[cospi_24_64]  \n\t"
-        "extp     %[step3_19],          $ac0,           31              \n\t"
-
-        : [step3_19] "=r"(step3_19)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19),
-          [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
-    step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_16 = step1_16 + step1_19;
-    step3_17 = step1_17 + step1_18;
-    step3_30 = step1_29 + step1_30;
-    step3_31 = step1_28 + step1_31;
-
-    step2_20 = step1_23 - step1_20;
-    step2_27 = step1_24 - step1_27;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "msub     $ac0,                 %[step2_20],    %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[step2_27],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_20],          $ac0,           31              \n\t"
-
-        : [step3_20] "=r"(step3_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
-    step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step2_21 = step1_22 - step1_21;
-    step2_26 = step1_25 - step1_26;
-
-    __asm__ __volatile__(
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "msub     $ac1,                 %[step2_21],    %[cospi_24_64]  \n\t"
-        "msub     $ac1,                 %[step2_26],    %[cospi_8_64]   \n\t"
-        "extp     %[step3_21],          $ac1,           31              \n\t"
-
-        : [step3_21] "=r"(step3_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21),
-          [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64));
-
-    temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
-    step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    step3_22 = step1_21 + step1_22;
-    step3_23 = step1_20 + step1_23;
-    step3_24 = step1_24 + step1_27;
-    step3_25 = step1_25 + step1_26;
-
-    step2_16 = step3_16 + step3_23;
-    step2_17 = step3_17 + step3_22;
-    step2_18 = step3_18 + step3_21;
-    step2_19 = step3_19 + step3_20;
-    step2_20 = step3_19 - step3_20;
-    step2_21 = step3_18 - step3_21;
-    step2_22 = step3_17 - step3_22;
-    step2_23 = step3_16 - step3_23;
-
-    step2_24 = step3_31 - step3_24;
-    step2_25 = step3_30 - step3_25;
-    step2_26 = step3_29 - step3_26;
-    step2_27 = step3_28 - step3_27;
-    step2_28 = step3_28 + step3_27;
-    step2_29 = step3_29 + step3_26;
-    step2_30 = step3_30 + step3_25;
-    step2_31 = step3_31 + step3_24;
-
-    __asm__ __volatile__(
-        "lh       %[load1],             0(%[input])                     \n\t"
-        "lh       %[load2],             32(%[input])                    \n\t"
-        "lh       %[load3],             16(%[input])                    \n\t"
-        "lh       %[load4],             48(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-        "add      %[result1],           %[load1],       %[load2]        \n\t"
-        "sub      %[result2],           %[load1],       %[load2]        \n\t"
-        "madd     $ac1,                 %[result1],     %[cospi_16_64]  \n\t"
-        "madd     $ac2,                 %[result2],     %[cospi_16_64]  \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-        "madd     $ac3,                 %[load3],       %[cospi_24_64]  \n\t"
-        "msub     $ac3,                 %[load4],       %[cospi_8_64]   \n\t"
-        "extp     %[temp2],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "madd     $ac1,                 %[load3],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_24_64]  \n\t"
-        "extp     %[temp3],             $ac1,           31              \n\t"
-
-        "add      %[step1_0],          %[temp0],        %[temp3]        \n\t"
-        "add      %[step1_1],          %[temp1],        %[temp2]        \n\t"
-        "sub      %[step1_2],          %[temp1],        %[temp2]        \n\t"
-        "sub      %[step1_3],          %[temp0],        %[temp3]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [result1] "=&r"(result1),
-          [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0),
-          [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2),
-          [step1_3] "=r"(step1_3)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64),
-          [cospi_8_64] "r"(cospi_8_64)
-
-            );
-
-    __asm__ __volatile__(
-        "lh       %[load1],             8(%[input])                     \n\t"
-        "lh       %[load2],             56(%[input])                    \n\t"
-        "lh       %[load3],             40(%[input])                    \n\t"
-        "lh       %[load4],             24(%[input])                    \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_28_64]  \n\t"
-        "msub     $ac1,                 %[load2],       %[cospi_4_64]   \n\t"
-        "extp     %[temp0],             $ac1,           31              \n\t"
-
-        "madd     $ac3,                 %[load1],       %[cospi_4_64]   \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_28_64]  \n\t"
-        "extp     %[temp3],             $ac3,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac2                            \n\t"
-        "mthi     $zero,                $ac2                            \n\t"
-
-        "madd     $ac2,                 %[load3],       %[cospi_12_64]  \n\t"
-        "msub     $ac2,                 %[load4],       %[cospi_20_64]  \n\t"
-        "extp     %[temp1],             $ac2,           31              \n\t"
-
-        "madd     $ac1,                 %[load3],       %[cospi_20_64]  \n\t"
-        "madd     $ac1,                 %[load4],       %[cospi_12_64]  \n\t"
-        "extp     %[temp2],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac3                            \n\t"
-        "mthi     $zero,                $ac3                            \n\t"
-
-        "sub      %[load1],             %[temp3],       %[temp2]        \n\t"
-        "sub      %[load1],             %[load1],       %[temp0]        \n\t"
-        "add      %[load1],             %[load1],       %[temp1]        \n\t"
-
-        "sub      %[load2],             %[temp0],       %[temp1]        \n\t"
-        "sub      %[load2],             %[load2],       %[temp2]        \n\t"
-        "add      %[load2],             %[load2],       %[temp3]        \n\t"
-
-        "madd     $ac1,                 %[load1],       %[cospi_16_64]  \n\t"
-        "madd     $ac3,                 %[load2],       %[cospi_16_64]  \n\t"
-
-        "extp     %[step1_5],           $ac1,           31              \n\t"
-        "extp     %[step1_6],           $ac3,           31              \n\t"
-        "add      %[step1_4],           %[temp0],       %[temp1]        \n\t"
-        "add      %[step1_7],           %[temp3],       %[temp2]        \n\t"
-
-        : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-          [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
-          [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4),
-          [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6),
-          [step1_7] "=r"(step1_7)
-        : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_16_64] "r"(cospi_16_64));
-
-    step2_0 = step1_0 + step1_7;
-    step2_1 = step1_1 + step1_6;
-    step2_2 = step1_2 + step1_5;
-    step2_3 = step1_3 + step1_4;
-    step2_4 = step1_3 - step1_4;
-    step2_5 = step1_2 - step1_5;
-    step2_6 = step1_1 - step1_6;
-    step2_7 = step1_0 - step1_7;
-
-    step1_0 = step2_0 + step3_15;
-    step1_1 = step2_1 + step3_14;
-    step1_2 = step2_2 + step3_13;
-    step1_3 = step2_3 + step3_12;
-    step1_4 = step2_4 + step3_11;
-    step1_5 = step2_5 + step3_10;
-    step1_6 = step2_6 + step3_9;
-    step1_7 = step2_7 + step3_8;
-    step1_8 = step2_7 - step3_8;
-    step1_9 = step2_6 - step3_9;
-    step1_10 = step2_5 - step3_10;
-    step1_11 = step2_4 - step3_11;
-    step1_12 = step2_3 - step3_12;
-    step1_13 = step2_2 - step3_13;
-    step1_14 = step2_1 - step3_14;
-    step1_15 = step2_0 - step3_15;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_27],    %[step2_20]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_20],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
-          [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_20 + step2_27) * cospi_16_64;
-    step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_26],    %[step2_21]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_21],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26),
-          [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_21 + step2_26) * cospi_16_64;
-    step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_25],    %[step2_22]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_22],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25),
-          [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_22 + step2_25) * cospi_16_64;
-    step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    __asm__ __volatile__(
-        "sub      %[temp0],             %[step2_24],    %[step2_23]     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "madd     $ac0,                 %[temp0],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_23],          $ac0,           31              \n\t"
-
-        : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23)
-        : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24),
-          [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64));
-
-    temp21 = (step2_23 + step2_24) * cospi_16_64;
-    step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
-
-    // final stage
-    output[0 * 32] = step1_0 + step2_31;
-    output[1 * 32] = step1_1 + step2_30;
-    output[2 * 32] = step1_2 + step2_29;
-    output[3 * 32] = step1_3 + step2_28;
-    output[4 * 32] = step1_4 + step1_27;
-    output[5 * 32] = step1_5 + step1_26;
-    output[6 * 32] = step1_6 + step1_25;
-    output[7 * 32] = step1_7 + step1_24;
-    output[8 * 32] = step1_8 + step1_23;
-    output[9 * 32] = step1_9 + step1_22;
-    output[10 * 32] = step1_10 + step1_21;
-    output[11 * 32] = step1_11 + step1_20;
-    output[12 * 32] = step1_12 + step2_19;
-    output[13 * 32] = step1_13 + step2_18;
-    output[14 * 32] = step1_14 + step2_17;
-    output[15 * 32] = step1_15 + step2_16;
-    output[16 * 32] = step1_15 - step2_16;
-    output[17 * 32] = step1_14 - step2_17;
-    output[18 * 32] = step1_13 - step2_18;
-    output[19 * 32] = step1_12 - step2_19;
-    output[20 * 32] = step1_11 - step1_20;
-    output[21 * 32] = step1_10 - step1_21;
-    output[22 * 32] = step1_9 - step1_22;
-    output[23 * 32] = step1_8 - step1_23;
-    output[24 * 32] = step1_7 - step1_24;
-    output[25 * 32] = step1_6 - step1_25;
-    output[26 * 32] = step1_5 - step1_26;
-    output[27 * 32] = step1_4 - step1_27;
-    output[28 * 32] = step1_3 - step2_28;
-    output[29 * 32] = step1_2 - step2_29;
-    output[30 * 32] = step1_1 - step2_30;
-    output[31 * 32] = step1_0 - step2_31;
-
-    input += 32;
-    output += 1;
-  }
-}
-
-void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
-                                  int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  // Rows
-  idct32_rows_dspr2(input, outptr, 32);
-
-  // Columns
-  aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride);
-}
-
-void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
-                                int stride) {
-  DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
-  int16_t *outptr = out;
-  uint32_t i;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  // Rows
-  idct32_rows_dspr2(input, outptr, 8);
-
-  outptr += 8;
-  __asm__ __volatile__(
-      "sw     $zero,      0(%[outptr])     \n\t"
-      "sw     $zero,      4(%[outptr])     \n\t"
-      "sw     $zero,      8(%[outptr])     \n\t"
-      "sw     $zero,     12(%[outptr])     \n\t"
-      "sw     $zero,     16(%[outptr])     \n\t"
-      "sw     $zero,     20(%[outptr])     \n\t"
-      "sw     $zero,     24(%[outptr])     \n\t"
-      "sw     $zero,     28(%[outptr])     \n\t"
-      "sw     $zero,     32(%[outptr])     \n\t"
-      "sw     $zero,     36(%[outptr])     \n\t"
-      "sw     $zero,     40(%[outptr])     \n\t"
-      "sw     $zero,     44(%[outptr])     \n\t"
-
-      :
-      : [outptr] "r"(outptr));
-
-  for (i = 0; i < 31; ++i) {
-    outptr += 32;
-
-    __asm__ __volatile__(
-        "sw     $zero,      0(%[outptr])     \n\t"
-        "sw     $zero,      4(%[outptr])     \n\t"
-        "sw     $zero,      8(%[outptr])     \n\t"
-        "sw     $zero,     12(%[outptr])     \n\t"
-        "sw     $zero,     16(%[outptr])     \n\t"
-        "sw     $zero,     20(%[outptr])     \n\t"
-        "sw     $zero,     24(%[outptr])     \n\t"
-        "sw     $zero,     28(%[outptr])     \n\t"
-        "sw     $zero,     32(%[outptr])     \n\t"
-        "sw     $zero,     36(%[outptr])     \n\t"
-        "sw     $zero,     40(%[outptr])     \n\t"
-        "sw     $zero,     44(%[outptr])     \n\t"
-
-        :
-        : [outptr] "r"(outptr));
-  }
-
-  // Columns
-  aom_idct32_cols_add_blk_dspr2(out, dest, stride);
-}
-
-void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  int r, out;
-  int32_t a1, absa1;
-  int32_t vector_a1;
-  int32_t t1, t2, t3, t4;
-  int32_t vector_1, vector_2, vector_3, vector_4;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__(
-      "addi     %[out],    %[out],    32      \n\t"
-      "sra      %[a1],     %[out],    6       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],     %[a1]         \n\t"
-        "replv.qb   %[vector_a1], %[absa1]      \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 32; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-
-          "lw             %[t1],          16(%[dest])                     \n\t"
-          "lw             %[t2],          20(%[dest])                     \n\t"
-          "lw             %[t3],          24(%[dest])                     \n\t"
-          "lw             %[t4],          28(%[dest])                     \n\t"
-          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    16(%[dest])                     \n\t"
-          "sw             %[vector_2],    20(%[dest])                     \n\t"
-          "sw             %[vector_3],    24(%[dest])                     \n\t"
-          "sw             %[vector_4],    28(%[dest])                     \n\t"
-
-          "add            %[dest],        %[dest],        %[stride]       \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
-
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 32; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t1],          0(%[dest])                      \n\t"
-          "lw             %[t2],          4(%[dest])                      \n\t"
-          "lw             %[t3],          8(%[dest])                      \n\t"
-          "lw             %[t4],          12(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    0(%[dest])                      \n\t"
-          "sw             %[vector_2],    4(%[dest])                      \n\t"
-          "sw             %[vector_3],    8(%[dest])                      \n\t"
-          "sw             %[vector_4],    12(%[dest])                     \n\t"
-
-          "lw             %[t1],          16(%[dest])                     \n\t"
-          "lw             %[t2],          20(%[dest])                     \n\t"
-          "lw             %[t3],          24(%[dest])                     \n\t"
-          "lw             %[t4],          28(%[dest])                     \n\t"
-          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
-          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
-          "sw             %[vector_1],    16(%[dest])                     \n\t"
-          "sw             %[vector_2],    20(%[dest])                     \n\t"
-          "sw             %[vector_3],    24(%[dest])                     \n\t"
-          "sw             %[vector_4],    28(%[dest])                     \n\t"
-
-          "add            %[dest],        %[dest],        %[stride]       \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
-            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
-            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
-            [dest] "+&r"(dest)
-          : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
deleted file mode 100644
index e6d0367cd..000000000
--- a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
-  int16_t step_0, step_1, step_2, step_3;
-  int Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int i;
-
-  for (i = 4; i--;) {
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-
-        "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp1],             8(%[output])                    \n\t"
-
-        "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
-        "sh       %[Temp2],             16(%[output])                   \n\t"
-
-        "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
-        "sh       %[Temp3],             24(%[output])                   \n\t"
-
-        : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
-          [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
-
-    input += 4;
-    output += 1;
-  }
-}
-
-void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                     int dest_stride) {
-  int16_t step_0, step_1, step_2, step_3;
-  int Temp0, Temp1, Temp2, Temp3;
-  const int const_2_power_13 = 8192;
-  int i;
-  uint8_t *dest_pix;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 4; ++i) {
-    dest_pix = (dest + i);
-
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[2]) * cospi_16_64;
-          step_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[2]) * cospi_16_64;
-          step_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             4(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "extp     %[step_0],            $ac0,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "extp     %[step_1],            $ac1,           31              \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        /*
-          temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-          step_2 = dct_const_round_shift(temp1);
-        */
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "extp     %[step_2],            $ac0,           31              \n\t"
-
-        /*
-          temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-          step_3 = dct_const_round_shift(temp2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[step_3],            $ac1,           31              \n\t"
-
-        /*
-          output[0]  = step_0 + step_3;
-          output[4]  = step_1 + step_2;
-          output[8]  = step_1 - step_2;
-          output[12] = step_0 - step_3;
-        */
-        "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "addi     %[Temp0],             %[Temp0],       8               \n\t"
-        "sra      %[Temp0],             %[Temp0],       4               \n\t"
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-        : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
-          [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
-          [dest_pix] "+r"(dest_pix)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
-          [dest_stride] "r"(dest_stride));
-
-    input += 4;
-  }
-}
-
-void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  // Rows
-  aom_idct4_rows_dspr2(input, outptr);
-
-  // Columns
-  aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  int a1, absa1;
-  int r;
-  int32_t out;
-  int t2, vector_a1, vector_a;
-  uint32_t pos = 45;
-  int16_t input_dc = input[0];
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
-  __asm__ __volatile__(
-      "addi     %[out],     %[out],    8       \n\t"
-      "sra      %[a1],      %[out],    4       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],     %[a1]         \n\t"
-        "replv.qb   %[vector_a1], %[absa1]      \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__(
-          "lw             %[t2],          0(%[dest])                      \n\t"
-          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
-          "sw             %[vector_a],    0(%[dest])                      \n\t"
-          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 4; r--;) {
-      __asm__ __volatile__(
-          "lw           %[t2],          0(%[dest])                        \n\t"
-          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
-          "sw           %[vector_a],    0(%[dest])                        \n\t"
-          "add          %[dest],        %[dest],          %[dest_stride]  \n\t"
-
-          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-
-void iadst4_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
-
-  x0 = s0 + s3 + s5;
-  x1 = s1 - s4 - s6;
-  x2 = sinpi_3_9 * s7;
-  x3 = s2;
-
-  s0 = x0 + x3;
-  s1 = x1 + x3;
-  s2 = x2;
-  s3 = x0 + x1 - x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
deleted file mode 100644
index 0a20f76f2..000000000
--- a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/mips/inv_txfm_dspr2.h"
-#include "aom_dsp/txfm_common.h"
-
-#if HAVE_DSPR2
-void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  const int const_2_power_13 = 8192;
-  int Temp0, Temp1, Temp2, Temp3, Temp4;
-  int i;
-
-  for (i = no_rows; i--;) {
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[Temp4],             $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp0],             0(%[output])                    \n\t"
-        "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp1],             16(%[output])                   \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp0],             32(%[output])                   \n\t"
-        "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp1],             48(%[output])                   \n\t"
-
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "sh       %[Temp0],             64(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
-        "sh       %[Temp1],             80(%[output])                   \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "sh       %[Temp0],             96(%[output])                   \n\t"
-        "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
-        "sh       %[Temp1],             112(%[output])                  \n\t"
-
-        : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
-          [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
-          [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
-          [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
-          [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_24_64] "r"(cospi_24_64), [output] "r"(output),
-          [input] "r"(input));
-
-    input += 8;
-    output += 1;
-  }
-}
-
-void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
-  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
-  int Temp0, Temp1, Temp2, Temp3;
-  int i;
-  const int const_2_power_13 = 8192;
-  uint8_t *dest_pix;
-  uint8_t *cm = aom_ff_cropTbl;
-
-  /* prefetch aom_ff_cropTbl */
-  prefetch_load(aom_ff_cropTbl);
-  prefetch_load(aom_ff_cropTbl + 32);
-  prefetch_load(aom_ff_cropTbl + 64);
-  prefetch_load(aom_ff_cropTbl + 96);
-  prefetch_load(aom_ff_cropTbl + 128);
-  prefetch_load(aom_ff_cropTbl + 160);
-  prefetch_load(aom_ff_cropTbl + 192);
-  prefetch_load(aom_ff_cropTbl + 224);
-
-  for (i = 0; i < 8; ++i) {
-    dest_pix = (dest + i);
-
-    __asm__ __volatile__(
-        /*
-          temp_1 = (input[0] + input[4]) * cospi_16_64;
-          step2_0 = dct_const_round_shift(temp_1);
-
-          temp_2 = (input[0] - input[4]) * cospi_16_64;
-          step2_1 = dct_const_round_shift(temp_2);
-        */
-        "lh       %[Temp0],             0(%[input])                     \n\t"
-        "lh       %[Temp1],             8(%[input])                     \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
-        "extp     %[step1_6],           $ac0,           31              \n\t"
-
-        "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
-        "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "extp     %[Temp2],             $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
-          step2_2 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             4(%[input])                     \n\t"
-        "lh       %[Temp1],             12(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "extp     %[Temp3],             $ac0,           31              \n\t"
-
-        /*
-          step1_1 = step2_1 + step2_2;
-          step1_2 = step2_1 - step2_2;
-        */
-        "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
-        "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
-
-        /*
-          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
-          step2_3 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
-        "extp     %[Temp1],             $ac1,           31              \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-
-        /*
-          step1_0 = step2_0 + step2_3;
-          step1_3 = step2_0 - step2_3;
-        */
-        "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
-        "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
-
-        /*
-          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-          step1_4 = dct_const_round_shift(temp_1);
-        */
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp1],             14(%[input])                    \n\t"
-        "lh       %[Temp0],             2(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
-        "extp     %[step1_4],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-          step1_7 = dct_const_round_shift(temp_2);
-        */
-        "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
-        "extp     %[step1_7],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-          step1_5 = dct_const_round_shift(temp_1);
-        */
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-
-        /*
-          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-          step1_6 = dct_const_round_shift(temp_2);
-        */
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-        "lh       %[Temp0],             10(%[input])                    \n\t"
-        "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
-        "lh       %[Temp1],             6(%[input])                     \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /*
-          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
-          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
-        */
-        "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
-        "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
-        "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
-        "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
-        "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
-
-        "mtlo     %[const_2_power_13],  $ac0                            \n\t"
-        "mthi     $zero,                $ac0                            \n\t"
-        "mtlo     %[const_2_power_13],  $ac1                            \n\t"
-        "mthi     $zero,                $ac1                            \n\t"
-
-        "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
-        "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
-
-        /*
-          step1_4 = step1_4 + step1_5;
-          step1_7 = step1_6 + step1_7;
-        */
-        "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
-        "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
-
-        "extp     %[step1_5],           $ac0,           31              \n\t"
-        "extp     %[step1_6],           $ac1,           31              \n\t"
-
-        /* add block */
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-        "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
-
-        "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
-        "addi     %[Temp0],             %[Temp0],       16              \n\t"
-        "sra      %[Temp0],             %[Temp0],       5               \n\t"
-        "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
-        "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
-        "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
-
-        : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1),
-          [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3),
-          [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5),
-          [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7),
-          [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix)
-        : [const_2_power_13] "r"(const_2_power_13),
-          [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64),
-          [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64),
-          [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64),
-          [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm),
-          [dest_stride] "r"(dest_stride));
-
-    input += 8;
-  }
-}
-
-void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 8);
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest,
-                              int dest_stride) {
-  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
-  int16_t *outptr = out;
-  uint32_t pos = 45;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp    %[pos],    1    \n\t" : : [pos] "r"(pos));
-
-  // First transform rows
-  idct8_rows_dspr2(input, outptr, 4);
-
-  outptr += 4;
-
-  __asm__ __volatile__(
-      "sw  $zero,   0(%[outptr])  \n\t"
-      "sw  $zero,   4(%[outptr])  \n\t"
-      "sw  $zero,  16(%[outptr])  \n\t"
-      "sw  $zero,  20(%[outptr])  \n\t"
-      "sw  $zero,  32(%[outptr])  \n\t"
-      "sw  $zero,  36(%[outptr])  \n\t"
-      "sw  $zero,  48(%[outptr])  \n\t"
-      "sw  $zero,  52(%[outptr])  \n\t"
-      "sw  $zero,  64(%[outptr])  \n\t"
-      "sw  $zero,  68(%[outptr])  \n\t"
-      "sw  $zero,  80(%[outptr])  \n\t"
-      "sw  $zero,  84(%[outptr])  \n\t"
-      "sw  $zero,  96(%[outptr])  \n\t"
-      "sw  $zero, 100(%[outptr])  \n\t"
-      "sw  $zero, 112(%[outptr])  \n\t"
-      "sw  $zero, 116(%[outptr])  \n\t"
-
-      :
-      : [outptr] "r"(outptr));
-
-  // Then transform columns and add to dest
-  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
-}
-
-void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
-                             int dest_stride) {
-  uint32_t pos = 45;
-  int32_t out;
-  int32_t r;
-  int32_t a1, absa1;
-  int32_t t1, t2, vector_a1, vector_1, vector_2;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-
-                       :
-                       : [pos] "r"(pos));
-
-  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
-  __asm__ __volatile__(
-      "addi     %[out],     %[out],     16      \n\t"
-      "sra      %[a1],      %[out],     5       \n\t"
-
-      : [out] "+r"(out), [a1] "=r"(a1)
-      :);
-
-  if (a1 < 0) {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__(
-        "abs        %[absa1],       %[a1]       \n\t"
-        "replv.qb   %[vector_a1],   %[absa1]    \n\t"
-
-        : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1)
-        : [a1] "r"(a1));
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__(
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
-            [vector_2] "=&r"(vector_2), [dest] "+&r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  } else {
-    /* use quad-byte
-     * input and output memory are four byte aligned */
-    __asm__ __volatile__("replv.qb   %[vector_a1],   %[a1]   \n\t"
-
-                         : [vector_a1] "=r"(vector_a1)
-                         : [a1] "r"(a1));
-
-    for (r = 8; r--;) {
-      __asm__ __volatile__(
-          "lw           %[t1],          0(%[dest])                      \n\t"
-          "lw           %[t2],          4(%[dest])                      \n\t"
-          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
-          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
-          "sw           %[vector_1],    0(%[dest])                      \n\t"
-          "sw           %[vector_2],    4(%[dest])                      \n\t"
-          "add          %[dest],        %[dest],        %[dest_stride]  \n\t"
-
-          : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1),
-            [vector_2] "=&r"(vector_2), [dest] "+r"(dest)
-          : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1));
-    }
-  }
-}
-
-void iadst8_dspr2(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-  int x0, x1, x2, x3, x4, x5, x6, x7;
-
-  x0 = input[7];
-  x1 = input[0];
-  x2 = input[5];
-  x3 = input[2];
-  x4 = input[3];
-  x5 = input[4];
-  x6 = input[1];
-  x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
-  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
-  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
-  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
-  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
-  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
-  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
-  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
-  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
-
-  output[0] = x0;
-  output[1] = -x4;
-  output[2] = x6;
-  output[3] = -x2;
-  output[4] = x3;
-  output[5] = -x7;
-  output[6] = x5;
-  output[7] = -x1;
-}
-#endif  // HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
index fc0c32ce3..38a10e9b2 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
@@ -404,10 +404,11 @@ void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
   }
 }
 
-void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
-                                    const uint8_t *b_limit_ptr,
-                                    const uint8_t *limit_ptr,
-                                    const uint8_t *thresh_ptr, int32_t count) {
+static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
+                                        const uint8_t *b_limit_ptr,
+                                        const uint8_t *limit_ptr,
+                                        const uint8_t *thresh_ptr,
+                                        int32_t count) {
   DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
   uint8_t early_exit = 0;
 
@@ -639,19 +640,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
       }
     }
   } else {
-    aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
-                                   thresh_ptr, count);
+    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
+                                count);
   }
 }
 
-void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
-                                   const uint8_t *b_limit_ptr,
-                                   const uint8_t *limit_ptr,
-                                   const uint8_t *thresh_ptr) {
+void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+                               const uint8_t *b_limit_ptr,
+                               const uint8_t *limit_ptr,
+                               const uint8_t *thresh_ptr) {
   mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
 }
 
-void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
+void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                     const uint8_t *b_limit_ptr,
                                     const uint8_t *limit_ptr,
                                     const uint8_t *thresh_ptr) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
index 883d0523d..8c41278be 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
index 72df09823..3e38ef3fb 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
index 3e6994714..cb599cf2e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
index 8db3e521f..6db1dac08 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -14,7 +14,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
index a3b5a9eb1..b67ccfe9d 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 8d2fd69f7..34733e42e 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
@@ -718,14 +719,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
   }
 }
 
-void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
-                                     const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh) {
+void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
+                                 const uint8_t *blimit, const uint8_t *limit,
+                                 const uint8_t *thresh) {
   mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
 }
 
-void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh) {
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
index 28528869b..3d3f1ec97 100644
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -11,7 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 #include "aom_dsp/mips/common_dspr2.h"
 #include "aom_dsp/mips/loopfilter_filters_dspr2.h"
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
index 48fbcfd47..eb919d42b 100644
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -14,7 +14,8 @@
 
 #include <msa.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
index 258eb5c07..58cdd80d9 100644
--- a/third_party/aom/aom_dsp/mips/sad_msa.c
+++ b/third_party/aom/aom_dsp/mips/sad_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
@@ -160,640 +161,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
   return sad;
 }
 
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref, ref0, ref1, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = height >> 1; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v4u32 sad;
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
-    ref0_4 = LD_UB(ref + 64);
-    ref += ref_stride;
-
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[0] = HADD_SW_S32((v4i32)sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[1] = HADD_SW_S32((v4i32)sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[2] = HADD_SW_S32((v4i32)sad);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref0, ref1, ref;
-  v16u8 diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1;
-  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  const uint8_t *src_dup, *ref_dup;
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v8u16 sad3_0 = { 0 };
-  v8u16 sad3_1 = { 0 };
-  v4u32 sad;
-
-  src_dup = src;
-  ref_dup = ref;
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
-    ref += ref_stride;
-
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[0] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[1] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[2] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad3_0, sad3_0);
-  sad += __msa_hadd_u_w(sad3_1, sad3_1);
-  sad_array[3] = HADD_SW_S32(sad);
-
-  sad0_0 = (v8u16)__msa_ldi_h(0);
-  sad0_1 = (v8u16)__msa_ldi_h(0);
-  sad1_0 = (v8u16)__msa_ldi_h(0);
-  sad1_1 = (v8u16)__msa_ldi_h(0);
-  sad2_0 = (v8u16)__msa_ldi_h(0);
-  sad2_1 = (v8u16)__msa_ldi_h(0);
-  sad3_0 = (v8u16)__msa_ldi_h(0);
-  sad3_1 = (v8u16)__msa_ldi_h(0);
-
-  for (ht_cnt = 64; ht_cnt--;) {
-    LD_UB4(src_dup, 16, src0, src1, src2, src3);
-    src_dup += src_stride;
-    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
-    ref_dup += ref_stride;
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[4] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[5] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[6] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad3_0, sad3_0);
-  sad += __msa_hadd_u_w(sad3_1, sad3_1);
-  sad_array[7] = HADD_SW_S32(sad);
-}
-
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
@@ -1290,76 +657,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
     return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
   }
 
-#define AOM_SAD_4xHEIGHTx3_MSA(height)                                   \
-  void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_8xHEIGHTx3_MSA(height)                                   \
-  void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_16xHEIGHTx3_MSA(height)                                   \
-  void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_32xHEIGHTx3_MSA(height)                                   \
-  void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_64xHEIGHTx3_MSA(height)                                   \
-  void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_4xHEIGHTx8_MSA(height)                                   \
-  void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_8xHEIGHTx8_MSA(height)                                   \
-  void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_16xHEIGHTx8_MSA(height)                                   \
-  void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_32xHEIGHTx8_MSA(height)                                   \
-  void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define AOM_SAD_64xHEIGHTx8_MSA(height)                                   \
-  void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
 #define AOM_SAD_4xHEIGHTx4D_MSA(height)                                   \
   void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
@@ -1438,92 +735,66 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
 /* clang-format off */
 // 64x64
 AOM_SAD_64xHEIGHT_MSA(64)
-AOM_SAD_64xHEIGHTx3_MSA(64)
-AOM_SAD_64xHEIGHTx8_MSA(64)
 AOM_SAD_64xHEIGHTx4D_MSA(64)
 AOM_AVGSAD_64xHEIGHT_MSA(64)
 
 // 64x32
 AOM_SAD_64xHEIGHT_MSA(32)
-AOM_SAD_64xHEIGHTx3_MSA(32)
-AOM_SAD_64xHEIGHTx8_MSA(32)
 AOM_SAD_64xHEIGHTx4D_MSA(32)
 AOM_AVGSAD_64xHEIGHT_MSA(32)
 
 // 32x64
 AOM_SAD_32xHEIGHT_MSA(64)
-AOM_SAD_32xHEIGHTx3_MSA(64)
-AOM_SAD_32xHEIGHTx8_MSA(64)
 AOM_SAD_32xHEIGHTx4D_MSA(64)
 AOM_AVGSAD_32xHEIGHT_MSA(64)
 
 // 32x32
 AOM_SAD_32xHEIGHT_MSA(32)
-AOM_SAD_32xHEIGHTx3_MSA(32)
-AOM_SAD_32xHEIGHTx8_MSA(32)
 AOM_SAD_32xHEIGHTx4D_MSA(32)
 AOM_AVGSAD_32xHEIGHT_MSA(32)
 
 // 32x16
 AOM_SAD_32xHEIGHT_MSA(16)
-AOM_SAD_32xHEIGHTx3_MSA(16)
-AOM_SAD_32xHEIGHTx8_MSA(16)
 AOM_SAD_32xHEIGHTx4D_MSA(16)
 AOM_AVGSAD_32xHEIGHT_MSA(16)
 
 // 16x32
 AOM_SAD_16xHEIGHT_MSA(32)
-AOM_SAD_16xHEIGHTx3_MSA(32)
-AOM_SAD_16xHEIGHTx8_MSA(32)
 AOM_SAD_16xHEIGHTx4D_MSA(32)
 AOM_AVGSAD_16xHEIGHT_MSA(32)
 
 // 16x16
 AOM_SAD_16xHEIGHT_MSA(16)
-AOM_SAD_16xHEIGHTx3_MSA(16)
-AOM_SAD_16xHEIGHTx8_MSA(16)
 AOM_SAD_16xHEIGHTx4D_MSA(16)
 AOM_AVGSAD_16xHEIGHT_MSA(16)
 
 // 16x8
 AOM_SAD_16xHEIGHT_MSA(8)
-AOM_SAD_16xHEIGHTx3_MSA(8)
-AOM_SAD_16xHEIGHTx8_MSA(8)
 AOM_SAD_16xHEIGHTx4D_MSA(8)
 AOM_AVGSAD_16xHEIGHT_MSA(8)
 
 // 8x16
 AOM_SAD_8xHEIGHT_MSA(16)
-AOM_SAD_8xHEIGHTx3_MSA(16)
-AOM_SAD_8xHEIGHTx8_MSA(16)
 AOM_SAD_8xHEIGHTx4D_MSA(16)
 AOM_AVGSAD_8xHEIGHT_MSA(16)
 
 // 8x8
 AOM_SAD_8xHEIGHT_MSA(8)
-AOM_SAD_8xHEIGHTx3_MSA(8)
-AOM_SAD_8xHEIGHTx8_MSA(8)
 AOM_SAD_8xHEIGHTx4D_MSA(8)
 AOM_AVGSAD_8xHEIGHT_MSA(8)
 
 // 8x4
 AOM_SAD_8xHEIGHT_MSA(4)
-AOM_SAD_8xHEIGHTx3_MSA(4)
-AOM_SAD_8xHEIGHTx8_MSA(4)
 AOM_SAD_8xHEIGHTx4D_MSA(4)
 AOM_AVGSAD_8xHEIGHT_MSA(4)
 
 // 4x8
 AOM_SAD_4xHEIGHT_MSA(8)
-AOM_SAD_4xHEIGHTx3_MSA(8)
-AOM_SAD_4xHEIGHTx8_MSA(8)
 AOM_SAD_4xHEIGHTx4D_MSA(8)
 AOM_AVGSAD_4xHEIGHT_MSA(8)
 
 // 4x4
 AOM_SAD_4xHEIGHT_MSA(4)
-AOM_SAD_4xHEIGHTx3_MSA(4)
-AOM_SAD_4xHEIGHTx8_MSA(4)
 AOM_SAD_4xHEIGHTx4D_MSA(4)
 AOM_AVGSAD_4xHEIGHT_MSA(4)
     /* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
index 3eb85107d..a8ee85b6b 100644
--- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 #include "aom_dsp/mips/macros_msa.h"
 #include "aom_dsp/variance.h"
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
index 37b89765d..bfed773ac 100644
--- a/third_party/aom/aom_dsp/mips/subtract_msa.c
+++ b/third_party/aom/aom_dsp/mips/subtract_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
deleted file mode 100644
index cba5d4445..000000000
--- a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
-  {                                                           \
-    v8i16 k0_m = __msa_fill_h(cnst0);                         \
-    v4i32 s0_m, s1_m, s2_m, s3_m;                             \
-                                                              \
-    s0_m = (v4i32)__msa_fill_h(cnst1);                        \
-    k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m);                  \
-                                                              \
-    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                   \
-    ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                      \
-    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);          \
-    SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                  \
-    out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
-                                                              \
-    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);          \
-    SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                  \
-    out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
-  }
-
-#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0,   \
-                              dst1, dst2, dst3)                               \
-  {                                                                           \
-    v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m;                                  \
-    v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m;                                  \
-                                                                              \
-    DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m,  \
-                tp4_m);                                                       \
-    DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m,  \
-                tp8_m);                                                       \
-    BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m);      \
-    BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m);      \
-    SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS);                  \
-    SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS);                  \
-    PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \
-                dst1, dst2, dst3);                                            \
-  }
-
-#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)           \
-  ({                                                   \
-    v8i16 dst_m;                                       \
-    v4i32 tp0_m, tp1_m;                                \
-                                                       \
-    DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);     \
-    SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS);         \
-    dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
-                                                       \
-    dst_m;                                             \
-  })
-
-#define MADD_SHORT(m0, m1, c0, c1, res0, res1)                              \
-  {                                                                         \
-    v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                               \
-    v8i16 madd_s0_m, madd_s1_m;                                             \
-                                                                            \
-    ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                              \
-    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \
-                madd0_m, madd1_m, madd2_m, madd3_m);                        \
-    SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS);        \
-    PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);            \
-  }
-
-#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1,   \
-                out2, out3)                                                   \
-  {                                                                           \
-    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                         \
-    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                         \
-                                                                              \
-    ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                            \
-    ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                            \
-    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \
-                cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
-    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m);  \
-    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
-    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                      \
-    DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \
-                cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);                        \
-    BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m);  \
-    SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
-    PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                      \
-  }
-#endif  // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
index 745fdfc9c..065c09ac5 100644
--- a/third_party/aom/aom_dsp/mips/variance_msa.c
+++ b/third_party/aom/aom_dsp/mips/variance_msa.c
@@ -9,7 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/mips/macros_msa.h"
 
 #define CALC_MSE_B(src, ref, var)                                   \
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
new file mode 100644
index 000000000..a1287f74f
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -0,0 +1,1460 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "av1/encoder/mathutils.h"
+
+#define kLowPolyNumParams 3
+
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t, or uint16_t)
+#define GET_BLOCK_MEAN(INT_TYPE, suffix)                                    \
+  static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+                                        int stride, int x_o, int y_o,       \
+                                        int block_size) {                   \
+    const int max_h = AOMMIN(h - y_o, block_size);                          \
+    const int max_w = AOMMIN(w - x_o, block_size);                          \
+    double block_mean = 0;                                                  \
+    for (int y = 0; y < max_h; ++y) {                                       \
+      for (int x = 0; x < max_w; ++x) {                                     \
+        block_mean += data[(y_o + y) * stride + x_o + x];                   \
+      }                                                                     \
+    }                                                                       \
+    return block_mean / (max_w * max_h);                                    \
+  }
+
+GET_BLOCK_MEAN(uint8_t, lowbd);
+GET_BLOCK_MEAN(uint16_t, highbd);
+
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+                                    int stride, int x_o, int y_o,
+                                    int block_size, int use_highbd) {
+  if (use_highbd)
+    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
+                                 block_size);
+  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t, or uint16_t)
+#define GET_NOISE_VAR(INT_TYPE, suffix)                                  \
+  static double get_noise_var_##suffix(                                  \
+      const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+      int h, int x_o, int y_o, int block_size_x, int block_size_y) {     \
+    const int max_h = AOMMIN(h - y_o, block_size_y);                     \
+    const int max_w = AOMMIN(w - x_o, block_size_x);                     \
+    double noise_var = 0;                                                \
+    double noise_mean = 0;                                               \
+    for (int y = 0; y < max_h; ++y) {                                    \
+      for (int x = 0; x < max_w; ++x) {                                  \
+        double noise = (double)data[(y_o + y) * stride + x_o + x] -      \
+                       denoised[(y_o + y) * stride + x_o + x];           \
+        noise_mean += noise;                                             \
+        noise_var += noise * noise;                                      \
+      }                                                                  \
+    }                                                                    \
+    noise_mean /= (max_w * max_h);                                       \
+    return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
+  }
+
+GET_NOISE_VAR(uint8_t, lowbd);
+GET_NOISE_VAR(uint16_t, highbd);
+
+static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
+                                   int w, int h, int stride, int x_o, int y_o,
+                                   int block_size_x, int block_size_y,
+                                   int use_highbd) {
+  if (use_highbd)
+    return get_noise_var_highbd((const uint16_t *)data,
+                                (const uint16_t *)denoised, w, h, stride, x_o,
+                                y_o, block_size_x, block_size_y);
+  return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
+                             block_size_x, block_size_y);
+}
+
+static void equation_system_clear(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
+  memset(eqns->x, 0, sizeof(*eqns->x) * n);
+  memset(eqns->b, 0, sizeof(*eqns->b) * n);
+}
+
+static void equation_system_copy(aom_equation_system_t *dst,
+                                 const aom_equation_system_t *src) {
+  const int n = dst->n;
+  memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
+  memcpy(dst->x, src->x, sizeof(*dst->x) * n);
+  memcpy(dst->b, src->b, sizeof(*dst->b) * n);
+}
+
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+  eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+  eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
+  eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
+  eqns->n = n;
+  if (!eqns->A || !eqns->b || !eqns->x) {
+    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+    aom_free(eqns->A);
+    aom_free(eqns->b);
+    aom_free(eqns->x);
+    memset(eqns, 0, sizeof(*eqns));
+    return 0;
+  }
+  equation_system_clear(eqns);
+  return 1;
+}
+
+static int equation_system_solve(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  double *b = (double *)aom_malloc(sizeof(*b) * n);
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  int ret = 0;
+  if (A == NULL || b == NULL) {
+    fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
+    aom_free(b);
+    aom_free(A);
+    return 0;
+  }
+  memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
+  memcpy(b, eqns->b, sizeof(*eqns->b) * n);
+  ret = linsolve(n, A, eqns->n, b, eqns->x);
+  aom_free(b);
+  aom_free(A);
+
+  if (ret == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+static void equation_system_add(aom_equation_system_t *dest,
+                                aom_equation_system_t *src) {
+  const int n = dest->n;
+  int i, j;
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; ++j) {
+      dest->A[i * n + j] += src->A[i * n + j];
+    }
+    dest->b[i] += src->b[i];
+  }
+}
+
+static void equation_system_free(aom_equation_system_t *eqns) {
+  if (!eqns) return;
+  aom_free(eqns->A);
+  aom_free(eqns->b);
+  aom_free(eqns->x);
+  memset(eqns, 0, sizeof(*eqns));
+}
+
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+  equation_system_clear(&solver->eqns);
+  solver->num_equations = 0;
+  solver->total = 0;
+}
+
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+                                      aom_noise_strength_solver_t *src) {
+  equation_system_add(&dest->eqns, &src->eqns);
+  dest->num_equations += src->num_equations;
+  dest->total += src->total;
+}
+
+// Return the number of coefficients required for the given parameters
+static int num_coeffs(const aom_noise_model_params_t params) {
+  const int n = 2 * params.lag + 1;
+  switch (params.shape) {
+    case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
+    case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
+  }
+  return 0;
+}
+
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+  const int kNumBins = 20;
+  if (!equation_system_init(&state->eqns, n)) {
+    fprintf(stderr, "Failed initialization noise state with size %d\n", n);
+    return 0;
+  }
+  state->ar_gain = 1.0;
+  state->num_observations = 0;
+  return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+                                        bit_depth);
+}
+
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+  const double kTolerance = 1e-6;
+  const int last = eqns->n - 1;
+  // Set all of the AR coefficients to zero, but try to solve for correlation
+  // with the luma channel
+  memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
+  if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
+    eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
+  }
+}
+
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+  if (!lut) return 0;
+  lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
+  if (!lut->points) return 0;
+  lut->num_points = num_points;
+  memset(lut->points, 0, sizeof(*lut->points) * num_points);
+  return 1;
+}
+
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
+  if (!lut) return;
+  aom_free(lut->points);
+  memset(lut, 0, sizeof(*lut));
+}
+
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x) {
+  int i = 0;
+  // Constant extrapolation for x <  x_0.
+  if (x < lut->points[0][0]) return lut->points[0][1];
+  for (i = 0; i < lut->num_points - 1; ++i) {
+    if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) {
+      const double a =
+          (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]);
+      return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a);
+    }
+  }
+  // Constant extrapolation for x > x_{n-1}
+  return lut->points[lut->num_points - 1][1];
+}
+
+static double noise_strength_solver_get_bin_index(
+    const aom_noise_strength_solver_t *solver, double value) {
+  const double val =
+      fclamp(value, solver->min_intensity, solver->max_intensity);
+  const double range = solver->max_intensity - solver->min_intensity;
+  return (solver->num_bins - 1) * (val - solver->min_intensity) / range;
+}
+
+static double noise_strength_solver_get_value(
+    const aom_noise_strength_solver_t *solver, double x) {
+  const double bin = noise_strength_solver_get_bin_index(solver, x);
+  const int bin_i0 = (int)floor(bin);
+  const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+  const double a = bin - bin_i0;
+  return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1];
+}
+
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
+  const double bin = noise_strength_solver_get_bin_index(solver, block_mean);
+  const int bin_i0 = (int)floor(bin);
+  const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+  const double a = bin - bin_i0;
+  const int n = solver->num_bins;
+  solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a);
+  solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a);
+  solver->eqns.A[bin_i1 * n + bin_i1] += a * a;
+  solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a);
+  solver->eqns.b[bin_i0] += (1.0 - a) * noise_std;
+  solver->eqns.b[bin_i1] += a * noise_std;
+  solver->total += noise_std;
+  solver->num_equations++;
+}
+
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
+  // Add regularization proportional to the number of constraints
+  const int n = solver->num_bins;
+  const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
+  int result = 0;
+  double mean = 0;
+
+  // Do this in a non-destructive manner so it is not confusing to the caller
+  double *old_A = solver->eqns.A;
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  if (!A) {
+    fprintf(stderr, "Unable to allocate copy of A\n");
+    return 0;
+  }
+  memcpy(A, old_A, sizeof(*A) * n * n);
+
+  for (int i = 0; i < n; ++i) {
+    const int i_lo = AOMMAX(0, i - 1);
+    const int i_hi = AOMMIN(n - 1, i + 1);
+    A[i * n + i_lo] -= kAlpha;
+    A[i * n + i] += 2 * kAlpha;
+    A[i * n + i_hi] -= kAlpha;
+  }
+
+  // Small regularization to give average noise strength
+  mean = solver->total / solver->num_equations;
+  for (int i = 0; i < n; ++i) {
+    A[i * n + i] += 1.0 / 8192.;
+    solver->eqns.b[i] += mean / 8192.;
+  }
+  solver->eqns.A = A;
+  result = equation_system_solve(&solver->eqns);
+  solver->eqns.A = old_A;
+
+  aom_free(A);
+  return result;
+}
+
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth) {
+  if (!solver) return 0;
+  memset(solver, 0, sizeof(*solver));
+  solver->num_bins = num_bins;
+  solver->min_intensity = 0;
+  solver->max_intensity = (1 << bit_depth) - 1;
+  solver->total = 0;
+  solver->num_equations = 0;
+  return equation_system_init(&solver->eqns, num_bins);
+}
+
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+  if (!solver) return;
+  equation_system_free(&solver->eqns);
+}
+
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i) {
+  const double range = solver->max_intensity - solver->min_intensity;
+  const int n = solver->num_bins;
+  return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+static void update_piecewise_linear_residual(
+    const aom_noise_strength_solver_t *solver,
+    const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+  const double dx = 255. / solver->num_bins;
+  for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
+    const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
+                                    solver, lut->points[i - 1][0])));
+    const int upper = AOMMIN(solver->num_bins - 1,
+                             (int)ceil(noise_strength_solver_get_bin_index(
+                                 solver, lut->points[i + 1][0])));
+    double r = 0;
+    for (int j = lower; j <= upper; ++j) {
+      const double x = aom_noise_strength_solver_get_center(solver, j);
+      if (x < lut->points[i - 1][0]) continue;
+      if (x >= lut->points[i + 1][0]) continue;
+      const double y = solver->eqns.x[j];
+      const double a = (x - lut->points[i - 1][0]) /
+                       (lut->points[i + 1][0] - lut->points[i - 1][0]);
+      const double estimate_y =
+          lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
+      r += fabs(y - estimate_y);
+    }
+    residual[i] = r * dx;
+  }
+}
+
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_output_points,
+    aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to give consistent results between
+  // different bit-depths.
+  const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
+  if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
+    fprintf(stderr, "Failed to init lut\n");
+    return 0;
+  }
+  for (int i = 0; i < solver->num_bins; ++i) {
+    lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
+    lut->points[i][1] = solver->eqns.x[i];
+  }
+  if (max_output_points < 0) {
+    max_output_points = solver->num_bins;
+  }
+
+  double *residual = aom_malloc(solver->num_bins * sizeof(*residual));
+  memset(residual, 0, sizeof(*residual) * solver->num_bins);
+
+  update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
+
+  // Greedily remove points if there are too many or if it doesn't hurt local
+  // approximation (never remove the end points)
+  while (lut->num_points > 2) {
+    int min_index = 1;
+    for (int j = 1; j < lut->num_points - 1; ++j) {
+      if (residual[j] < residual[min_index]) {
+        min_index = j;
+      }
+    }
+    const double dx =
+        lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
+    const double avg_residual = residual[min_index] / dx;
+    if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
+      break;
+    }
+
+    const int num_remaining = lut->num_points - min_index - 1;
+    memmove(lut->points + min_index, lut->points + min_index + 1,
+            sizeof(lut->points[0]) * num_remaining);
+    lut->num_points--;
+
+    update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
+                                     min_index + 1);
+  }
+  aom_free(residual);
+  return 1;
+}
+
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+                               int block_size, int bit_depth, int use_highbd) {
+  const int n = block_size * block_size;
+  aom_equation_system_t eqns;
+  double *AtA_inv = 0;
+  double *A = 0;
+  int x = 0, y = 0, i = 0, j = 0;
+  if (!equation_system_init(&eqns, kLowPolyNumParams)) {
+    fprintf(stderr, "Failed to init equation system for block_size=%d\n",
+            block_size);
+    return 0;
+  }
+
+  AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
+                                 sizeof(*AtA_inv));
+  A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
+  if (AtA_inv == NULL || A == NULL) {
+    fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
+            block_size);
+    aom_free(AtA_inv);
+    aom_free(A);
+    equation_system_free(&eqns);
+    return 0;
+  }
+
+  block_finder->A = A;
+  block_finder->AtA_inv = AtA_inv;
+  block_finder->block_size = block_size;
+  block_finder->normalization = (1 << bit_depth) - 1;
+  block_finder->use_highbd = use_highbd;
+
+  for (y = 0; y < block_size; ++y) {
+    const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
+    for (x = 0; x < block_size; ++x) {
+      const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
+      const double coords[3] = { yd, xd, 1 };
+      const int row = y * block_size + x;
+      A[kLowPolyNumParams * row + 0] = yd;
+      A[kLowPolyNumParams * row + 1] = xd;
+      A[kLowPolyNumParams * row + 2] = 1;
+
+      for (i = 0; i < kLowPolyNumParams; ++i) {
+        for (j = 0; j < kLowPolyNumParams; ++j) {
+          eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
+        }
+      }
+    }
+  }
+
+  // Lazy inverse using existing equation solver.
+  for (i = 0; i < kLowPolyNumParams; ++i) {
+    memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+    eqns.b[i] = 1;
+    equation_system_solve(&eqns);
+
+    for (j = 0; j < kLowPolyNumParams; ++j) {
+      AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+    }
+  }
+  equation_system_free(&eqns);
+  return 1;
+}
+
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+  if (!block_finder) return;
+  aom_free(block_finder->A);
+  aom_free(block_finder->AtA_inv);
+  memset(block_finder, 0, sizeof(*block_finder));
+}
+
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block) {
+  const int block_size = block_finder->block_size;
+  const int n = block_size * block_size;
+  const double *A = block_finder->A;
+  const double *AtA_inv = block_finder->AtA_inv;
+  double plane_coords[kLowPolyNumParams];
+  double AtA_inv_b[kLowPolyNumParams];
+  int xi, yi, i;
+
+  if (block_finder->use_highbd) {
+    const uint16_t *const data16 = (const uint16_t *const)data;
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data16[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  } else {
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  }
+  multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
+  multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
+               kLowPolyNumParams, 1);
+  multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
+
+  for (i = 0; i < n; ++i) {
+    block[i] -= plane[i];
+  }
+}
+
+typedef struct {
+  int index;
+  float score;
+} index_and_score_t;
+
+static int compare_scores(const void *a, const void *b) {
+  const float diff =
+      ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
+  if (diff < 0)
+    return -1;
+  else if (diff > 0)
+    return 1;
+  return 0;
+}
+
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+                              const uint8_t *const data, int w, int h,
+                              int stride, uint8_t *flat_blocks) {
+  // The gradient-based features used in this code are based on:
+  //  A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+  //  correlation for improved video denoising," 2012 19th, ICIP.
+  // The thresholds are more lenient to allow for correct grain modeling
+  // in extreme cases.
+  const int block_size = block_finder->block_size;
+  const int n = block_size * block_size;
+  const double kTraceThreshold = 0.15 / (32 * 32);
+  const double kRatioThreshold = 1.25;
+  const double kNormThreshold = 0.08 / (32 * 32);
+  const double kVarThreshold = 0.005 / (double)n;
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+  int num_flat = 0;
+  int bx = 0, by = 0;
+  double *plane = (double *)aom_malloc(n * sizeof(*plane));
+  double *block = (double *)aom_malloc(n * sizeof(*block));
+  index_and_score_t *scores = (index_and_score_t *)aom_malloc(
+      num_blocks_w * num_blocks_h * sizeof(*scores));
+  if (plane == NULL || block == NULL || scores == NULL) {
+    fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
+    aom_free(plane);
+    aom_free(block);
+    aom_free(scores);
+    return -1;
+  }
+
+#ifdef NOISE_MODEL_LOG_SCORE
+  fprintf(stderr, "score = [");
+#endif
+  for (by = 0; by < num_blocks_h; ++by) {
+    for (bx = 0; bx < num_blocks_w; ++bx) {
+      // Compute gradient covariance matrix.
+      double Gxx = 0, Gxy = 0, Gyy = 0;
+      double var = 0;
+      double mean = 0;
+      int xi, yi;
+      aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
+                                          bx * block_size, by * block_size,
+                                          plane, block);
+
+      for (yi = 1; yi < block_size - 1; ++yi) {
+        for (xi = 1; xi < block_size - 1; ++xi) {
+          const double gx = (block[yi * block_size + xi + 1] -
+                             block[yi * block_size + xi - 1]) /
+                            2;
+          const double gy = (block[yi * block_size + xi + block_size] -
+                             block[yi * block_size + xi - block_size]) /
+                            2;
+          Gxx += gx * gx;
+          Gxy += gx * gy;
+          Gyy += gy * gy;
+
+          mean += block[yi * block_size + xi];
+          var += block[yi * block_size + xi] * block[yi * block_size + xi];
+        }
+      }
+      mean /= (block_size - 2) * (block_size - 2);
+
+      // Normalize gradients by block_size.
+      Gxx /= ((block_size - 2) * (block_size - 2));
+      Gxy /= ((block_size - 2) * (block_size - 2));
+      Gyy /= ((block_size - 2) * (block_size - 2));
+      var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
+
+      {
+        const double trace = Gxx + Gyy;
+        const double det = Gxx * Gyy - Gxy * Gxy;
+        const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
+        const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
+        const double norm = e1;  // Spectral norm
+        const double ratio = (e1 / AOMMAX(e2, 1e-6));
+        const int is_flat = (trace < kTraceThreshold) &&
+                            (ratio < kRatioThreshold) &&
+                            (norm < kNormThreshold) && (var > kVarThreshold);
+        // The following weights are used to combine the above features to give
+        // a sigmoid score for flatness. If the input was normalized to [0,100]
+        // the magnitude of these values would be close to 1 (e.g., weights
+        // corresponding to variance would be a factor of 10000x smaller).
+        // The weights are given in the following order:
+        //    [{var}, {ratio}, {trace}, {norm}, offset]
+        // with one of the most discriminative being simply the variance.
+        const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
+        const float score =
+            (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio +
+                                     weights[2] * trace + weights[3] * norm +
+                                     weights[4]))));
+        flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
+        scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
+        scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+        fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+                is_flat);
+#endif
+        num_flat += is_flat;
+      }
+    }
+#ifdef NOISE_MODEL_LOG_SCORE
+    fprintf(stderr, "\n");
+#endif
+  }
+#ifdef NOISE_MODEL_LOG_SCORE
+  fprintf(stderr, "];\n");
+#endif
+  // Find the top-scored blocks (most likely to be flat) and set the flat blocks
+  // to be the union of the thresholded results and the top 10th percentile of
+  // the scored results.
+  qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+  const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+  const float score_threshold = scores[top_nth_percentile].score;
+  for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+    if (scores[i].score >= score_threshold) {
+      num_flat += flat_blocks[scores[i].index] == 0;
+      flat_blocks[scores[i].index] |= 1;
+    }
+  }
+  aom_free(block);
+  aom_free(plane);
+  aom_free(scores);
+  return num_flat;
+}
+
+int aom_noise_model_init(aom_noise_model_t *model,
+                         const aom_noise_model_params_t params) {
+  const int n = num_coeffs(params);
+  const int lag = params.lag;
+  const int bit_depth = params.bit_depth;
+  int x = 0, y = 0, i = 0, c = 0;
+
+  memset(model, 0, sizeof(*model));
+  if (params.lag < 1) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+    return 0;
+  }
+  if (params.lag > kMaxLag) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+            kMaxLag);
+    return 0;
+  }
+
+  memcpy(&model->params, &params, sizeof(params));
+  for (c = 0; c < 3; ++c) {
+    if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+    if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+  }
+  model->n = n;
+  model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+
+  for (y = -lag; y <= 0; ++y) {
+    const int max_x = y == 0 ? -1 : lag;
+    for (x = -lag; x <= max_x; ++x) {
+      switch (params.shape) {
+        case AOM_NOISE_SHAPE_DIAMOND:
+          if (abs(x) <= y + lag) {
+            model->coords[i][0] = x;
+            model->coords[i][1] = y;
+            ++i;
+          }
+          break;
+        case AOM_NOISE_SHAPE_SQUARE:
+          model->coords[i][0] = x;
+          model->coords[i][1] = y;
+          ++i;
+          break;
+        default:
+          fprintf(stderr, "Invalid shape\n");
+          aom_noise_model_free(model);
+          return 0;
+      }
+    }
+  }
+  assert(i == n);
+  return 1;
+}
+
+void aom_noise_model_free(aom_noise_model_t *model) {
+  int c = 0;
+  if (!model) return;
+
+  aom_free(model->coords);
+  for (c = 0; c < 3; ++c) {
+    equation_system_free(&model->latest_state[c].eqns);
+    equation_system_free(&model->combined_state[c].eqns);
+
+    equation_system_free(&model->latest_state[c].strength_solver.eqns);
+    equation_system_free(&model->combined_state[c].strength_solver.eqns);
+  }
+  memset(model, 0, sizeof(*model));
+}
+
+// Extracts the neighborhood defined by coords around point (x, y) from
+// the difference between the data and denoised images. Also extracts the
+// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
+#define EXTRACT_AR_ROW(INT_TYPE, suffix)                                   \
+  static double extract_ar_row_##suffix(                                   \
+      int(*coords)[2], int num_coords, const INT_TYPE *const data,         \
+      const INT_TYPE *const denoised, int stride, int sub_log2[2],         \
+      const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised,  \
+      int alt_stride, int x, int y, double *buffer) {                      \
+    for (int i = 0; i < num_coords; ++i) {                                 \
+      const int x_i = x + coords[i][0], y_i = y + coords[i][1];            \
+      buffer[i] =                                                          \
+          (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \
+    }                                                                      \
+    const double val =                                                     \
+        (double)data[y * stride + x] - denoised[y * stride + x];           \
+                                                                           \
+    if (alt_data && alt_denoised) {                                        \
+      double avg_data = 0, avg_denoised = 0;                               \
+      int num_samples = 0;                                                 \
+      for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) {              \
+        const int y_up = (y << sub_log2[1]) + dy_i;                        \
+        for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) {            \
+          const int x_up = (x << sub_log2[0]) + dx_i;                      \
+          avg_data += alt_data[y_up * alt_stride + x_up];                  \
+          avg_denoised += alt_denoised[y_up * alt_stride + x_up];          \
+          num_samples++;                                                   \
+        }                                                                  \
+      }                                                                    \
+      buffer[num_coords] = (avg_data - avg_denoised) / num_samples;        \
+    }                                                                      \
+    return val;                                                            \
+  }
+
+EXTRACT_AR_ROW(uint8_t, lowbd);
+EXTRACT_AR_ROW(uint16_t, highbd);
+
+// Accumulates the least-squares equations of channel `c` from every flat block
+// of the frame into noise_model->latest_state[c].eqns.
+//
+// For each sample visited inside a flat block, extract_ar_row_{lowbd,highbd}
+// is called with `buffer` as its output row and returns `val`. The function
+// then adds buffer[i] * buffer[j] to A[i * n + j] and buffer[i] * val to b[i]
+// (both divided by normalization^2, with normalization = 2^bit_depth - 1) and
+// increments latest_state[c].num_observations.
+//
+// `sub_log2` holds the log2 subsampling of this channel in x ([0]) and y
+// ([1]). `alt_data`, `alt_denoised` and `alt_stride` are forwarded unchanged
+// to the row extractor. `flat_blocks` is indexed as
+// flat_blocks[by * num_blocks_w + bx]; only non-zero entries are processed.
+//
+// Returns 1 on success, or 0 if the scratch buffer cannot be allocated.
+static int add_block_observations(
+    aom_noise_model_t *noise_model, int c, const uint8_t *const data,
+    const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
+    const uint8_t *const alt_data, const uint8_t *const alt_denoised,
+    int alt_stride, const uint8_t *const flat_blocks, int block_size,
+    int num_blocks_w, int num_blocks_h) {
+  const int lag = noise_model->params.lag;
+  const int num_coords = noise_model->n;
+  const double normalization = (1 << noise_model->params.bit_depth) - 1;
+  double *A = noise_model->latest_state[c].eqns.A;
+  double *b = noise_model->latest_state[c].eqns.b;
+  // One extra slot beyond num_coords: the row extractor writes
+  // buffer[num_coords] when alt_data/alt_denoised are provided.
+  double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1));
+  const int n = noise_model->latest_state[c].eqns.n;
+
+  if (!buffer) {
+    fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
+    return 0;
+  }
+  for (int by = 0; by < num_blocks_h; ++by) {
+    // Top row of this block in the (possibly subsampled) plane.
+    const int y_o = by * (block_size >> sub_log2[1]);
+    for (int bx = 0; bx < num_blocks_w; ++bx) {
+      // Left column of this block in the (possibly subsampled) plane.
+      const int x_o = bx * (block_size >> sub_log2[0]);
+      if (!flat_blocks[by * num_blocks_w + bx]) {
+        continue;
+      }
+      // Start at the block edge only when the neighbouring block above/left
+      // is flat as well; otherwise skip the first `lag` rows/columns.
+      int y_start =
+          (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag;
+      int x_start =
+          (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag;
+      // Clip the block to the bottom of the plane.
+      int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+                         block_size >> sub_log2[1]);
+      // Stay `lag` columns away from the right edge of the plane, and from
+      // the right edge of the block unless the block to the right is flat.
+      int x_end = AOMMIN(
+          (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag,
+          (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1])
+              ? (block_size >> sub_log2[0])
+              : ((block_size >> sub_log2[0]) - lag));
+      for (int y = y_start; y < y_end; ++y) {
+        for (int x = x_start; x < x_end; ++x) {
+          // In high bit-depth mode the byte pointers are reinterpreted as
+          // 16-bit sample buffers.
+          const double val =
+              noise_model->params.use_highbd
+                  ? extract_ar_row_highbd(noise_model->coords, num_coords,
+                                          (const uint16_t *const)data,
+                                          (const uint16_t *const)denoised,
+                                          stride, sub_log2,
+                                          (const uint16_t *const)alt_data,
+                                          (const uint16_t *const)alt_denoised,
+                                          alt_stride, x + x_o, y + y_o, buffer)
+                  : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
+                                         denoised, stride, sub_log2, alt_data,
+                                         alt_denoised, alt_stride, x + x_o,
+                                         y + y_o, buffer);
+          // Accumulate the outer product of the row with itself into A and
+          // the row scaled by the observed value into b.
+          for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < n; ++j) {
+              A[i * n + j] +=
+                  (buffer[i] * buffer[j]) / (normalization * normalization);
+            }
+            b[i] += (buffer[i] * val) / (normalization * normalization);
+          }
+          noise_model->latest_state[c].num_observations++;
+        }
+      }
+    }
+  }
+  aom_free(buffer);
+  return 1;
+}
+
+// Adds one (block mean, noise strength) measurement per usable flat block to
+// the strength solver of channel `c` (latest_state[c].strength_solver).
+//
+// `coeffs` is only read for c > 0, where coeffs[noise_model->n] is taken as
+// the correlation with luma. The block mean is computed from `alt_data` when
+// it is non-NULL and from `data` otherwise; the noise variance is always
+// computed from `data` and `denoised`. `sub_log2` holds the log2 subsampling
+// of this channel in x ([0]) and y ([1]). The luma strength solver
+// (latest_state[0]) must already have been solved when c > 0.
+static void add_noise_std_observations(
+    aom_noise_model_t *noise_model, int c, const double *coeffs,
+    const uint8_t *const data, const uint8_t *const denoised, int w, int h,
+    int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
+    const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
+    int num_blocks_h) {
+  const int num_coords = noise_model->n;
+  aom_noise_strength_solver_t *noise_strength_solver =
+      &noise_model->latest_state[c].strength_solver;
+
+  const aom_noise_strength_solver_t *noise_strength_luma =
+      &noise_model->latest_state[0].strength_solver;
+  const double luma_gain = noise_model->latest_state[0].ar_gain;
+  const double noise_gain = noise_model->latest_state[c].ar_gain;
+  for (int by = 0; by < num_blocks_h; ++by) {
+    // Top row of this block in the (possibly subsampled) plane.
+    const int y_o = by * (block_size >> sub_log2[1]);
+    for (int bx = 0; bx < num_blocks_w; ++bx) {
+      // Left column of this block in the (possibly subsampled) plane.
+      const int x_o = bx * (block_size >> sub_log2[0]);
+      if (!flat_blocks[by * num_blocks_w + bx]) {
+        continue;
+      }
+      // Number of samples of this block that lie inside the plane.
+      const int num_samples_h =
+          AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+                 block_size >> sub_log2[1]);
+      const int num_samples_w =
+          AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]),
+                 (block_size >> sub_log2[0]));
+      // Make sure that we have a reasonable amount of samples to consider the
+      // block
+      if (num_samples_w * num_samples_h > block_size) {
+        // The mean is taken at full resolution (offsets are scaled back up by
+        // the subsampling), the variance at this channel's resolution.
+        const double block_mean = get_block_mean(
+            alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
+            x_o << sub_log2[0], y_o << sub_log2[1], block_size,
+            noise_model->params.use_highbd);
+        const double noise_var = get_noise_var(
+            data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
+            y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
+            noise_model->params.use_highbd);
+        // We want to remove the part of the noise that came from being
+        // correlated with luma. Note that the noise solver for luma must
+        // have already been run.
+        const double luma_strength =
+            c > 0 ? luma_gain * noise_strength_solver_get_value(
+                                    noise_strength_luma, block_mean)
+                  : 0;
+        const double corr = c > 0 ? coeffs[num_coords] : 0;
+        // Chroma noise:
+        //    N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
+        // The uncorrelated component:
+        //   uncorr_var = noise_var - (corr * luma_strength)^2
+        // But don't allow fully correlated noise (hence the max), since the
+        // synthesis cannot model it.
+        const double uncorr_std = sqrt(
+            AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
+        // After we've removed correlation with luma, undo the gain that will
+        // come from running the IIR filter.
+        const double adjusted_strength = uncorr_std / noise_gain;
+        aom_noise_strength_solver_add_measurement(
+            noise_strength_solver, block_mean, adjusted_strength);
+      }
+    }
+  }
+}
+
+// Reports whether the latest (single-frame) noise estimate disagrees with the
+// combined (multi-frame) estimate. Returns 1 when the AR coefficients have
+// diverged (normalized cross correlation below a threshold) or when the
+// weighted difference between the noise strength solutions exceeds a
+// threshold; returns 0 otherwise.
+static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+  // Both thresholds are somewhat arbitrary and will likely need further tuning
+  // (or to be exported as parameters). The strength threshold is applied to a
+  // weighted difference between the noise strength histograms.
+  const double kCoeffThreshold = 0.9;
+  const double kStrengthThreshold =
+      0.005 * (1 << (noise_model->params.bit_depth - 8));
+  // Only channel 0 is compared.
+  for (int c = 0; c < 1; ++c) {
+    const aom_noise_state_t *const latest = &noise_model->latest_state[c];
+    const aom_noise_state_t *const combined = &noise_model->combined_state[c];
+
+    const double corr = aom_normalized_cross_correlation(
+        latest->eqns.x, combined->eqns.x, combined->eqns.n);
+    if (corr < kCoeffThreshold) return 1;
+
+    const double dx = 1.0 / latest->strength_solver.num_bins;
+    const aom_equation_system_t *latest_eqns = &latest->strength_solver.eqns;
+    const aom_equation_system_t *combined_eqns =
+        &combined->strength_solver.eqns;
+    const int num_eqns = latest_eqns->n;
+
+    double weighted_diff = 0;
+    double weight_sum = 0;
+    for (int col = 0; col < num_eqns; ++col) {
+      // Each unknown is weighted by the square root of its column sum in A.
+      double col_sum = 0;
+      for (int row = 0; row < num_eqns; ++row) {
+        col_sum += latest_eqns->A[row * num_eqns + col];
+      }
+      const double weight = sqrt(col_sum);
+      weighted_diff +=
+          weight * fabs(latest_eqns->x[col] - combined_eqns->x[col]);
+      weight_sum += weight;
+    }
+    if (weighted_diff * dx / weight_sum > kStrengthThreshold) return 1;
+  }
+  return 0;
+}
+
+// Solves the AR equation system stored in `state` and derives state->ar_gain
+// from it. When `is_chroma` is non-zero the last unknown (the correlation with
+// luma) is excluded from the variance estimates. Returns the result of
+// equation_system_solve(); when that is zero, ar_gain is left at 1.0.
+static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+  aom_equation_system_t *const eqns = &state->eqns;
+  const int ret = equation_system_solve(eqns);
+  state->ar_gain = 1.0;
+  if (ret) {
+    const int n = eqns->n;
+    const int num_ar = n - is_chroma;
+
+    // The AR gain is needed later to fit the noise strength as a function of
+    // intensity. In the Yule-Walker equations the diagonal should be the
+    // variance of the correlated noise; with a least squares estimate the
+    // diagonal varies somewhat, so the mean of the diagonal is used as the
+    // estimate of the overall variance (valid for both formulations).
+    double var = 0;
+    for (int i = 0; i < num_ar; ++i) {
+      var += eqns->A[i * n + i] / state->num_observations;
+    }
+    var /= num_ar;
+
+    // Keep track of E(Y^2) = <b, x> + E(X^2).
+    // For chroma, where an estimate of the correlation with luma exists, the
+    // correlated part is removed by subtracting the last column of A, scaled
+    // by the correlation estimate, from b:
+    //   E(y^2) = <b - A(:, end) * x(end), x>
+    double sum_covar = 0;
+    for (int i = 0; i < num_ar; ++i) {
+      const double bi =
+          is_chroma ? eqns->b[i] - eqns->A[i * n + (n - 1)] * eqns->x[n - 1]
+                    : eqns->b[i];
+      sum_covar += (bi * eqns->x[i]) / state->num_observations;
+    }
+
+    // Estimate the variance of the uncorrelated noise signal and use it to
+    // determine the gain of the AR filter.
+    const double noise_var = AOMMAX(var - sum_covar, 1e-6);
+    state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
+  }
+  return ret;
+}
+
+// Updates the noise model with the observations of one frame.
+//
+// `data` and `denoised` hold up to three planes; processing stops at the first
+// plane whose data or denoised pointer is NULL. `chroma_sub_log2` is the log2
+// subsampling applied to planes 1 and 2. `flat_blocks` has one entry per
+// block_size x block_size block, num_blocks_w entries per row; non-zero
+// entries mark the blocks used for estimation.
+//
+// Returns:
+//   AOM_NOISE_STATUS_INVALID_ARGUMENT          block_size <= 1, or
+//                                              block_size < 2 * lag + 1
+//   AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS  at most one flat block
+//   AOM_NOISE_STATUS_INTERNAL_ERROR            allocation or solver failure
+//   AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE      the latest luma estimate differs
+//                                              from the combined one; the
+//                                              combined state is not updated
+//   AOM_NOISE_STATUS_OK                        otherwise
+aom_noise_status_t aom_noise_model_update(
+    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+    const uint8_t *const denoised[3], int w, int h, int stride[3],
+    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
+  int y_model_different = 0;
+  int num_blocks = 0;
+  int i = 0, channel = 0;
+
+  if (block_size <= 1) {
+    fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+    return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+  }
+
+  if (block_size < noise_model->params.lag * 2 + 1) {
+    fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
+            noise_model->params.lag * 2 + 1);
+    return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+  }
+
+  // Only divide by block_size once it is known to be valid; doing so before
+  // the checks above would divide by zero for block_size == 0.
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+
+  // Clear the latest equation system
+  for (i = 0; i < 3; ++i) {
+    equation_system_clear(&noise_model->latest_state[i].eqns);
+    noise_model->latest_state[i].num_observations = 0;
+    noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
+  }
+
+  // Check that we have enough flat blocks
+  for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
+    if (flat_blocks[i]) {
+      num_blocks++;
+    }
+  }
+
+  if (num_blocks <= 1) {
+    fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
+    return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
+  }
+
+  for (channel = 0; channel < 3; ++channel) {
+    int no_subsampling[2] = { 0, 0 };
+    // Chroma planes additionally observe the luma plane.
+    const uint8_t *alt_data = channel > 0 ? data[0] : 0;
+    const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
+    int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
+    const int is_chroma = channel != 0;
+    if (!data[channel] || !denoised[channel]) break;
+    if (!add_block_observations(noise_model, channel, data[channel],
+                                denoised[channel], w, h, stride[channel], sub,
+                                alt_data, alt_denoised, stride[0], flat_blocks,
+                                block_size, num_blocks_w, num_blocks_h)) {
+      fprintf(stderr, "Adding block observation failed\n");
+      return AOM_NOISE_STATUS_INTERNAL_ERROR;
+    }
+
+    // A failed solve is fatal for luma; chroma uses a fallback solution.
+    if (!ar_equation_system_solve(&noise_model->latest_state[channel],
+                                  is_chroma)) {
+      if (is_chroma) {
+        set_chroma_coefficient_fallback_soln(
+            &noise_model->latest_state[channel].eqns);
+      } else {
+        fprintf(stderr, "Solving latest noise equation system failed %d!\n",
+                channel);
+        return AOM_NOISE_STATUS_INTERNAL_ERROR;
+      }
+    }
+
+    add_noise_std_observations(
+        noise_model, channel, noise_model->latest_state[channel].eqns.x,
+        data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
+        stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
+
+    if (!aom_noise_strength_solver_solve(
+            &noise_model->latest_state[channel].strength_solver)) {
+      fprintf(stderr, "Solving latest noise strength failed!\n");
+      return AOM_NOISE_STATUS_INTERNAL_ERROR;
+    }
+
+    // Once a combined luma estimate exists, check whether the latest luma
+    // estimate has different noise characteristics.
+    if (channel == 0 &&
+        noise_model->combined_state[channel].strength_solver.num_equations >
+            0 &&
+        is_noise_model_different(noise_model)) {
+      y_model_different = 1;
+    }
+
+    // Don't update the combined stats if the y model is different.
+    if (y_model_different) continue;
+
+    noise_model->combined_state[channel].num_observations +=
+        noise_model->latest_state[channel].num_observations;
+    equation_system_add(&noise_model->combined_state[channel].eqns,
+                        &noise_model->latest_state[channel].eqns);
+    if (!ar_equation_system_solve(&noise_model->combined_state[channel],
+                                  is_chroma)) {
+      if (is_chroma) {
+        set_chroma_coefficient_fallback_soln(
+            &noise_model->combined_state[channel].eqns);
+      } else {
+        fprintf(stderr, "Solving combined noise equation system failed %d!\n",
+                channel);
+        return AOM_NOISE_STATUS_INTERNAL_ERROR;
+      }
+    }
+
+    noise_strength_solver_add(
+        &noise_model->combined_state[channel].strength_solver,
+        &noise_model->latest_state[channel].strength_solver);
+
+    if (!aom_noise_strength_solver_solve(
+            &noise_model->combined_state[channel].strength_solver)) {
+      fprintf(stderr, "Solving combined noise strength failed!\n");
+      return AOM_NOISE_STATUS_INTERNAL_ERROR;
+    }
+  }
+
+  return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
+                           : AOM_NOISE_STATUS_OK;
+}
+
+// Overwrites the combined state of all three channels with the latest state:
+// the AR equation system, the strength solver's equation system and number of
+// equations, the observation count and the AR gain.
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
+  for (int channel = 0; channel < 3; ++channel) {
+    aom_noise_state_t *const dst = &noise_model->combined_state[channel];
+    aom_noise_state_t *const src = &noise_model->latest_state[channel];
+
+    equation_system_copy(&dst->eqns, &src->eqns);
+    equation_system_copy(&dst->strength_solver.eqns,
+                         &src->strength_solver.eqns);
+    dst->strength_solver.num_equations = src->strength_solver.num_equations;
+    dst->num_observations = src->num_observations;
+    dst->ar_gain = src->ar_gain;
+  }
+}
+
+// Converts the combined (multi-frame) state of the noise model into film grain
+// parameters. *film_grain is zeroed and then filled with the scaling points,
+// the scaling shift, the quantized AR coefficients and their shift, plus a set
+// of fixed chroma/overlap settings.
+//
+// Returns 0 (leaving *film_grain untouched) if params.lag > 3, and 1
+// otherwise.
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                         aom_film_grain_t *film_grain) {
+  if (noise_model->params.lag > 3) {
+    fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
+    return 0;
+  }
+  memset(film_grain, 0, sizeof(*film_grain));
+
+  film_grain->apply_grain = 1;
+  film_grain->update_parameters = 1;
+
+  film_grain->ar_coeff_lag = noise_model->params.lag;
+
+  // Convert the scaling functions to 8 bit values
+  // NOTE(review): 14 and 10 are presumably the maximum number of points for
+  // the luma and chroma curves, and the return values of the fits are not
+  // checked here -- confirm against aom_noise_strength_solver_fit_piecewise.
+  aom_noise_strength_lut_t scaling_points[3];
+  aom_noise_strength_solver_fit_piecewise(
+      &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
+  aom_noise_strength_solver_fit_piecewise(
+      &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
+  aom_noise_strength_solver_fit_piecewise(
+      &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
+
+  // Both the domain and the range of the scaling functions in the film_grain
+  // are normalized to 8-bit (e.g., they are implicitly scaled during grain
+  // synthesis).
+  const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
+  double max_scaling_value = 1e-4;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < scaling_points[c].num_points; ++i) {
+      scaling_points[c].points[i][0] =
+          AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor);
+      scaling_points[c].points[i][1] =
+          AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor);
+      max_scaling_value =
+          AOMMAX(scaling_points[c].points[i][1], max_scaling_value);
+    }
+  }
+
+  // Scaling_shift values are in the range [8,11]
+  const int max_scaling_value_log2 =
+      clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
+  film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
+
+  // Scale the strengths so that the largest one uses the available 8-bit
+  // range; the result is clamped to [0, 255] when stored below.
+  const double scale_factor = 1 << (8 - max_scaling_value_log2);
+  film_grain->num_y_points = scaling_points[0].num_points;
+  film_grain->num_cb_points = scaling_points[1].num_points;
+  film_grain->num_cr_points = scaling_points[2].num_points;
+
+  int(*film_grain_scaling[3])[2] = {
+    film_grain->scaling_points_y,
+    film_grain->scaling_points_cb,
+    film_grain->scaling_points_cr,
+  };
+  for (int c = 0; c < 3; c++) {
+    for (int i = 0; i < scaling_points[c].num_points; ++i) {
+      // Round to nearest integer.
+      film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5);
+      film_grain_scaling[c][i][1] = clamp(
+          (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255);
+    }
+  }
+  aom_noise_strength_lut_free(scaling_points + 0);
+  aom_noise_strength_lut_free(scaling_points + 1);
+  aom_noise_strength_lut_free(scaling_points + 2);
+
+  // Convert the ar_coeffs into 8-bit values
+  // n_coeff is the number of luma unknowns; for chroma, entry n_coeff of the
+  // solution is read as the correlation with luma.
+  const int n_coeff = noise_model->combined_state[0].eqns.n;
+  double max_coeff = 1e-4, min_coeff = -1e-4;
+  double y_corr[2] = { 0, 0 };
+  double avg_luma_strength = 0;
+  for (int c = 0; c < 3; c++) {
+    aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+    for (int i = 0; i < n_coeff; ++i) {
+      max_coeff = AOMMAX(max_coeff, eqns->x[i]);
+      min_coeff = AOMMIN(min_coeff, eqns->x[i]);
+    }
+    // Since the correlation between luma/chroma was computed in an already
+    // scaled space, we adjust it in the un-scaled space.
+    aom_noise_strength_solver_t *solver =
+        &noise_model->combined_state[c].strength_solver;
+    // Compute a weighted average of the strength for the channel.
+    double average_strength = 0, total_weight = 0;
+    for (int i = 0; i < solver->eqns.n; ++i) {
+      // Weight of unknown i: square root of the sum of row i of A.
+      double w = 0;
+      for (int j = 0; j < solver->eqns.n; ++j) {
+        w += solver->eqns.A[i * solver->eqns.n + j];
+      }
+      w = sqrt(w);
+      average_strength += solver->eqns.x[i] * w;
+      total_weight += w;
+    }
+    if (total_weight == 0)
+      average_strength = 1;
+    else
+      average_strength /= total_weight;
+    if (c == 0) {
+      avg_luma_strength = average_strength;
+    } else {
+      y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
+      max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
+      min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
+    }
+  }
+  // Shift value: AR coeffs range (values 6-9)
+  // 6: [-2, 2),  7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
+  film_grain->ar_coeff_shift =
+      clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
+            6, 9);
+  double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
+  int *ar_coeffs[3] = {
+    film_grain->ar_coeffs_y,
+    film_grain->ar_coeffs_cb,
+    film_grain->ar_coeffs_cr,
+  };
+  for (int c = 0; c < 3; ++c) {
+    aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+    for (int i = 0; i < n_coeff; ++i) {
+      ar_coeffs[c][i] =
+          clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
+    }
+    // Chroma stores the adjusted luma correlation as one extra coefficient.
+    if (c > 0) {
+      ar_coeffs[c][n_coeff] =
+          clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
+    }
+  }
+
+  // At the moment, the noise modeling code assumes that the chroma scaling
+  // functions are a function of luma.
+  film_grain->cb_mult = 128;       // 8 bits
+  film_grain->cb_luma_mult = 192;  // 8 bits
+  film_grain->cb_offset = 256;     // 9 bits
+
+  film_grain->cr_mult = 128;       // 8 bits
+  film_grain->cr_luma_mult = 192;  // 8 bits
+  film_grain->cr_offset = 256;     // 9 bits
+
+  film_grain->chroma_scaling_from_luma = 0;
+  film_grain->grain_scale_shift = 0;
+  film_grain->overlap_flag = 1;
+  return 1;
+}
+
+// Multiplies the first n entries of b by the matching entries of a, in place.
+// Nothing is touched when n <= 0.
+static void pointwise_multiply(const float *a, float *b, int n) {
+  const float *factor = a;
+  float *value = b;
+  for (int remaining = n; remaining > 0; --remaining) {
+    *value = *value * *factor;
+    ++value;
+    ++factor;
+  }
+}
+
+// Builds a block_size x block_size separable half-cosine window:
+//   window[y * block_size + x] =
+//       cos((y + .5) * PI / block_size - PI / 2) *
+//       cos((x + .5) * PI / block_size - PI / 2)
+// Returns a buffer obtained from aom_malloc() that the caller releases with
+// aom_free(), or NULL if the allocation fails.
+static float *get_half_cos_window(int block_size) {
+  float *window_function =
+      (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+  // The caller tests the returned pointer, so report the failure instead of
+  // writing through a NULL pointer below.
+  if (!window_function) return NULL;
+  for (int y = 0; y < block_size; ++y) {
+    const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+    for (int x = 0; x < block_size; ++x) {
+      const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+      window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+    }
+  }
+  return window_function;
+}
+
+// Generates dither_and_quantize_<suffix>(), which converts the float `result`
+// plane into INT_TYPE samples written to `denoised`.
+//
+// Each output sample is result * block_normalization + 0.5, clamped to
+// [0, block_normalization] and converted to INT_TYPE. The quantization error
+// is then diffused onto not-yet-visited neighbours in `result` with
+// Floyd-Steinberg weights (7/16 right, 3/16 below-left, 5/16 below, 1/16
+// below-right), skipping neighbours that fall outside the
+// (w >> chroma_sub_w) x (h >> chroma_sub_h) plane.
+//
+// `result` is read starting (block_size >> chroma_sub_h) rows and
+// (block_size >> chroma_sub_w) columns in from its origin, and is modified in
+// place by the error diffusion.
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix)                               \
+  static void dither_and_quantize_##suffix(                                 \
+      float *result, int result_stride, INT_TYPE *denoised, int w, int h,   \
+      int stride, int chroma_sub_w, int chroma_sub_h, int block_size,       \
+      float block_normalization) {                                          \
+    for (int y = 0; y < (h >> chroma_sub_h); ++y) {                         \
+      for (int x = 0; x < (w >> chroma_sub_w); ++x) {                       \
+        const int result_idx =                                              \
+            (y + (block_size >> chroma_sub_h)) * result_stride + x +        \
+            (block_size >> chroma_sub_w);                                   \
+        INT_TYPE new_val = (INT_TYPE)AOMMIN(                                \
+            AOMMAX(result[result_idx] * block_normalization + 0.5f, 0),     \
+            block_normalization);                                           \
+        const float err =                                                   \
+            -(((float)new_val) / block_normalization - result[result_idx]); \
+        denoised[y * stride + x] = new_val;                                 \
+        if (x + 1 < (w >> chroma_sub_w)) {                                  \
+          result[result_idx + 1] += err * 7.0f / 16.0f;                     \
+        }                                                                   \
+        if (y + 1 < (h >> chroma_sub_h)) {                                  \
+          if (x > 0) {                                                      \
+            result[result_idx + result_stride - 1] += err * 3.0f / 16.0f;   \
+          }                                                                 \
+          result[result_idx + result_stride] += err * 5.0f / 16.0f;         \
+          if (x + 1 < (w >> chroma_sub_w)) {                                \
+            result[result_idx + result_stride + 1] += err * 1.0f / 16.0f;   \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    }                                                                       \
+  }
+
+// 8-bit and 16-bit sample variants.
+DITHER_AND_QUANTIZE(uint8_t, lowbd);
+DITHER_AND_QUANTIZE(uint16_t, highbd);
+
+// Denoises up to three planes of `data` into `denoised` using half-overlapped,
+// windowed block processing.
+//
+// For every block: the block is extracted with
+// aom_flat_block_finder_extract_block() (which fills `plane_d` and `block_d`
+// -- presumably a planar fit and the residual; confirm against that helper),
+// the residual is windowed, transformed, filtered with noise_psd[c] and
+// transformed back, and the windowed sum of residual and plane is accumulated
+// into `result`. The accumulated plane is finally dithered and quantized into
+// denoised[c]. Planes whose data or denoised pointer is NULL are skipped.
+//
+// `chroma_sub` holds the log2 chroma subsampling in x ([0]) and y ([1]); both
+// entries must be equal. `block_size` must be greater than 1.
+//
+// Returns 1 on success and 0 on invalid arguments or if any initialization or
+// allocation fails (in which case no plane is processed).
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd) {
+  float *plane = NULL, *block = NULL, *window_full = NULL,
+        *window_chroma = NULL;
+  double *block_d = NULL, *plane_d = NULL;
+  struct aom_noise_tx_t *tx_full = NULL;
+  struct aom_noise_tx_t *tx_chroma = NULL;
+  float *result = NULL;
+  int init_success = 1;
+  aom_flat_block_finder_t block_finder_full;
+  aom_flat_block_finder_t block_finder_chroma;
+  const float kBlockNormalization = (float)((1 << bit_depth) - 1);
+  // block_size == 0 would divide by zero below, and block_size == 1 would
+  // give the half-overlap loops a step of zero.
+  if (block_size <= 1) {
+    fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+    return 0;
+  }
+  if (chroma_sub[0] != chroma_sub[1]) {
+    fprintf(stderr,
+            "aom_wiener_denoise_2d doesn't handle different chroma "
+            "subsampling\n");
+    return 0;
+  }
+  const int num_blocks_w = (w + block_size - 1) / block_size;
+  const int num_blocks_h = (h + block_size - 1) / block_size;
+  // The accumulation buffer has one block of padding on every side.
+  const int result_stride = (num_blocks_w + 2) * block_size;
+  const int result_height = (num_blocks_h + 2) * block_size;
+  init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+                                             bit_depth, use_highbd);
+  // Multiply in size_t (as the memset below does) rather than in int.
+  result =
+      (float *)aom_malloc(sizeof(*result) * result_stride * result_height);
+  plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+  block =
+      (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+  block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+  plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+  window_full = get_half_cos_window(block_size);
+  tx_full = aom_noise_tx_malloc(block_size);
+
+  // Subsampled chroma needs its own (smaller) block finder, window and
+  // transform; otherwise the full-size ones are shared.
+  if (chroma_sub[0] != 0) {
+    init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+                                               block_size >> chroma_sub[0],
+                                               bit_depth, use_highbd);
+    window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+    tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+  } else {
+    window_chroma = window_full;
+    tx_chroma = tx_full;
+  }
+
+  init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+                  (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+                  (window_full != NULL) && (window_chroma != NULL) &&
+                  (result != NULL);
+  // When initialization failed the loop is skipped entirely (c starts at 3).
+  for (int c = init_success ? 0 : 3; c < 3; ++c) {
+    float *window_function = c == 0 ? window_full : window_chroma;
+    aom_flat_block_finder_t *block_finder = &block_finder_full;
+    const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+    const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+    struct aom_noise_tx_t *tx =
+        (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+    if (!data[c] || !denoised[c]) continue;
+    if (c > 0 && chroma_sub[0] != 0) {
+      block_finder = &block_finder_chroma;
+    }
+    memset(result, 0, sizeof(*result) * result_stride * result_height);
+    // Do overlapped block processing (half overlapped). The block rows can
+    // easily be done in parallel
+    for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+         offsy += (block_size >> chroma_sub_h) / 2) {
+      for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+           offsx += (block_size >> chroma_sub_w) / 2) {
+        // Pad the boundary when processing each block-set.
+        for (int by = -1; by < num_blocks_h; ++by) {
+          for (int bx = -1; bx < num_blocks_w; ++bx) {
+            const int pixels_per_block =
+                (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+            aom_flat_block_finder_extract_block(
+                block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+                stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+                by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+            // The transform works on floats.
+            for (int j = 0; j < pixels_per_block; ++j) {
+              block[j] = (float)block_d[j];
+              plane[j] = (float)plane_d[j];
+            }
+            pointwise_multiply(window_function, block, pixels_per_block);
+            aom_noise_tx_forward(tx, block);
+            aom_noise_tx_filter(tx, noise_psd[c]);
+            aom_noise_tx_inverse(tx, block);
+
+            // Apply window function to the plane approximation (we will apply
+            // it to the sum of plane + block when composing the results).
+            pointwise_multiply(window_function, plane, pixels_per_block);
+
+            // Overlap-add into the padded result buffer; the +1 block offset
+            // accounts for the padding.
+            for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+              const int y_result =
+                  y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+              for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+                const int x_result =
+                    x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+                result[y_result * result_stride + x_result] +=
+                    (block[y * (block_size >> chroma_sub_w) + x] +
+                     plane[y * (block_size >> chroma_sub_w) + x]) *
+                    window_function[y * (block_size >> chroma_sub_w) + x];
+              }
+            }
+          }
+        }
+      }
+    }
+    if (use_highbd) {
+      dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+                                 w, h, stride[c], chroma_sub_w, chroma_sub_h,
+                                 block_size, kBlockNormalization);
+    } else {
+      dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+                                stride[c], chroma_sub_w, chroma_sub_h,
+                                block_size, kBlockNormalization);
+    }
+  }
+  aom_free(result);
+  aom_free(plane);
+  aom_free(block);
+  aom_free(plane_d);
+  aom_free(block_d);
+  aom_free(window_full);
+
+  aom_noise_tx_free(tx_full);
+
+  aom_flat_block_finder_free(&block_finder_full);
+  // The chroma resources are separate objects only when chroma is subsampled.
+  if (chroma_sub[0] != 0) {
+    aom_flat_block_finder_free(&block_finder_chroma);
+    aom_free(window_chroma);
+    aom_noise_tx_free(tx_chroma);
+  }
+  return init_success;
+}
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
new file mode 100644
index 000000000..dabeacc14
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_NOISE_MODEL_H_
+#define AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_synthesis.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+  double *A;  // System matrix (presumably n x n -- confirm in noise_model.c)
+  double *b;  // Right-hand side of the system
+  double *x;  // Solution of the system
+  int n;      // Size of the system (presumably the number of unknowns)
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds n points as (x, y) pairs, that store the curve.
+ */
+typedef struct {
+  double (*points)[2];
+  int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut. */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut  The lut data.
+ * \param[in] x    The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ *   y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
+ * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
+ * corresponding normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+  aom_equation_system_t eqns;
+  double min_intensity;
+  double max_intensity;
+  int num_bins;
+  int num_equations;
+  double total;
+} aom_noise_strength_solver_t;
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in]  solver    The noise solver to be initialized.
+ * \param[in]  num_bins  Number of bins to use in the internal representation.
+ * \param[in]  bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in]  i  The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in]  block_mean  The average block intensity,
+ * \param[in]  noise_std   The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution
+ *
+ * \param[in] max_num_points  The maximum number of output points
+ * \param[out] lut  The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_num_points,
+    aom_noise_strength_lut_t *lut);
+
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ * <[y_i x_i 1], [a_1, a_2, a_3]>  = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
+ */
+typedef struct {
+  double *AtA_inv;
+  double *A;
+  int num_params;  // The number of parameters used for internal low-order model
+  int block_size;  // The block size the finder was initialized with
+  double normalization;  // Normalization factor (1 / (2^(bit_depth) - 1))
+  int use_highbd;        // Whether input data should be interpreted as uint16
+} aom_flat_block_finder_t;
+
+/*!\brief Init the block_finder with the given block size, bit_depth */
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+                               int block_size, int bit_depth, int use_highbd);
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
+
+/*!\brief Helper to extract a block and low order "planar" model. */
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block);
+
+/*!\brief Runs the flat block finder on the input data.
+ *
+ * Find flat blocks in the input image data. Returns a map of
+ * flat_blocks, where the value of flat_blocks map will be non-zero
+ * when a block is determined to be flat. A higher value indicates a bigger
+ * confidence in the decision.
+ */
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+                              const uint8_t *const data, int w, int h,
+                              int stride, uint8_t *flat_blocks);
+
+// The noise shape indicates the allowed coefficients in the AR model.
+typedef enum {
+  AOM_NOISE_SHAPE_DIAMOND = 0,
+  AOM_NOISE_SHAPE_SQUARE = 1
+} aom_noise_shape;
+
+// The parameters of the noise model include the shape type, lag, the
+// bit depth of the input images provided, and whether the input images
+// will be using uint16 (or uint8) representation.
+typedef struct {
+  aom_noise_shape shape;
+  int lag;
+  int bit_depth;
+  int use_highbd;
+} aom_noise_model_params_t;
+
+/*!\brief State of a noise model estimate for a single channel.
+ *
+ * This contains a system of equations that can be used to solve
+ * for the auto-regressive coefficients as well as a noise strength
+ * solver that can be used to model noise strength as a function of
+ * intensity.
+ */
+typedef struct {
+  aom_equation_system_t eqns;
+  aom_noise_strength_solver_t strength_solver;
+  int num_observations;  // The number of observations in the eqn system
+  double ar_gain;        // The gain of the current AR filter
+} aom_noise_state_t;
+
+/*!\brief Complete model of noise for a planar video
+ *
+ * This includes a noise model for the latest frame and an aggregated
+ * estimate over all previous frames that had similar parameters.
+ */
+typedef struct {
+  aom_noise_model_params_t params;
+  aom_noise_state_t combined_state[3];  // Combined state per channel
+  aom_noise_state_t latest_state[3];    // Latest state per channel
+  int (*coords)[2];  // Offsets (x,y) of the coefficient samples
+  int n;             // Number of parameters (size of coords)
+  int bit_depth;
+} aom_noise_model_t;
+
+/*!\brief Result of a noise model update. */
+typedef enum {
+  AOM_NOISE_STATUS_OK = 0,
+  AOM_NOISE_STATUS_INVALID_ARGUMENT,
+  AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+  AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
+  AOM_NOISE_STATUS_INTERNAL_ERROR,
+} aom_noise_status_t;
+
+/*!\brief Initializes a noise model with the given parameters.
+ *
+ * Returns 0 on failure.
+ */
+int aom_noise_model_init(aom_noise_model_t *model,
+                         const aom_noise_model_params_t params);
+void aom_noise_model_free(aom_noise_model_t *model);
+
+/*!\brief Updates the noise model with a new frame observation.
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating if the update was successful. If the
+ * update was successful, the combined_state is updated with measurements from
+ * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise
+ * state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model     The noise model to be updated
+ * \param[in]     data            Raw frame data
+ * \param[in]     denoised        Denoised frame data.
+ * \param[in]     w               Frame width
+ * \param[in]     h               Frame height
+ * \param[in]     strides         Stride of the planes
+ * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in]     flat_blocks     A map to blocks that have been determined flat
+ * \param[in]     block_size      The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+    const uint8_t *const denoised[3], int w, int h, int strides[3],
+    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*!\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ *    grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                         aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in]     data            Raw frame data
+ * \param[out]    denoised        Denoised frame data
+ * \param[in]     w               Frame width
+ * \param[in]     h               Frame height
+ * \param[in]     stride          Stride of the planes
+ * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in]     noise_psd       The power spectral density of the noise
+ * \param[in]     block_size      The size of blocks
+ * \param[in]     bit_depth       Bit depth of the image
+ * \param[in]     use_highbd      If true, uint8 pointers are interpreted as
+ *                                uint16 and stride is measured in uint16.
+ *                                This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub_log2[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd);
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
new file mode 100644
index 000000000..87e8e9fec
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+// Returns the default (flat) noise PSD level for a block_size x block_size
+// transform: (factor^2 / 10000) * block_size^2 / 8.
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+  // The product is evaluated left to right in float, exactly as before, so
+  // the rounding of the result does not change.
+  const float scaled_strength = factor * factor / 10000;
+  return scaled_strength * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform. Both buffers are allocated by aom_noise_tx_malloc with room for
+// 2 * block_size * block_size floats.
+struct aom_noise_tx_t {
+  float *tx_block;  // Transformed data, read as pairs of floats per coeff
+  float *temp;      // Scratch buffer handed to fft/ifft as their 2nd argument
+  int block_size;   // Side length of the (square) block being transformed
+  void (*fft)(const float *, float *, float *);   // Forward transform
+  void (*ifft)(const float *, float *, float *);  // Inverse transform
+};
+
+// Creates a noise transform for block_size x block_size blocks. Supported
+// sizes are 2, 4, 8, 16 and 32; any other size prints a message to stderr and
+// returns NULL. NULL is also returned when an allocation fails. The returned
+// object must be released with aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+  struct aom_noise_tx_t *tx =
+      (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+  if (tx == NULL) return NULL;
+  memset(tx, 0, sizeof(*tx));
+
+  // Select the forward/inverse transform pair matching the block size.
+  if (block_size == 2) {
+    tx->fft = aom_fft2x2_float;
+    tx->ifft = aom_ifft2x2_float;
+  } else if (block_size == 4) {
+    tx->fft = aom_fft4x4_float;
+    tx->ifft = aom_ifft4x4_float;
+  } else if (block_size == 8) {
+    tx->fft = aom_fft8x8_float;
+    tx->ifft = aom_ifft8x8_float;
+  } else if (block_size == 16) {
+    tx->fft = aom_fft16x16_float;
+    tx->ifft = aom_ifft16x16_float;
+  } else if (block_size == 32) {
+    tx->fft = aom_fft32x32_float;
+    tx->ifft = aom_ifft32x32_float;
+  } else {
+    aom_free(tx);
+    fprintf(stderr, "Unsupported block size %d\n", block_size);
+    return NULL;
+  }
+  tx->block_size = block_size;
+
+  // Both buffers hold two floats per sample of the block.
+  const size_t buffer_bytes =
+      2 * sizeof(*tx->tx_block) * block_size * block_size;
+  tx->tx_block = (float *)aom_memalign(32, buffer_bytes);
+  tx->temp = (float *)aom_memalign(32, buffer_bytes);
+  if (tx->tx_block == NULL || tx->temp == NULL) {
+    aom_noise_tx_free(tx);
+    return NULL;
+  }
+
+  // Clear the buffers up front. Some outputs of the forward transform are
+  // real only (the imaginary component will never be touched)
+  memset(tx->tx_block, 0, buffer_bytes);
+  memset(tx->temp, 0, buffer_bytes);
+  return tx;
+}
+
+// Applies the forward transform to "data"; the result is stored in the
+// transform's internal tx_block buffer, using temp as the working buffer.
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+  float *const scratch = noise_tx->temp;
+  float *const coeffs = noise_tx->tx_block;
+  noise_tx->fft(data, scratch, coeffs);
+}
+
+// Scales every coefficient pair held in tx_block by a gain derived from its
+// power p = c[0]^2 + c[1]^2 and the matching psd entry: (p - psd) / p when p
+// exceeds kBeta * psd (and a small epsilon), otherwise the fixed floor
+// (kBeta - 1) / kBeta. "psd" must hold block_size * block_size entries.
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+  const float kBeta = 1.1f;
+  const float kEps = 1e-6f;
+  const int num_coeffs = noise_tx->block_size * noise_tx->block_size;
+  for (int i = 0; i < num_coeffs; ++i) {
+    float *const coeff = noise_tx->tx_block + 2 * i;
+    // Power must be computed before either component is rescaled.
+    const float power = coeff[0] * coeff[0] + coeff[1] * coeff[1];
+    float gain;
+    if (power > kBeta * psd[i] && power > 1e-6) {
+      gain = (power - psd[i]) / AOMMAX(power, kEps);
+    } else {
+      gain = (kBeta - 1.0f) / kBeta;
+    }
+    coeff[0] *= gain;
+    coeff[1] *= gain;
+  }
+}
+
+// Inverse-transforms the buffered coefficients into "data" and divides each
+// output sample by the number of samples in the block.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+  const int num_samples = noise_tx->block_size * noise_tx->block_size;
+  noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+  float *const end = data + num_samples;
+  for (float *sample = data; sample < end; ++sample) {
+    *sample = *sample / (float)num_samples;
+  }
+}
+
+// Adds the power (c[0]^2 + c[1]^2) of the buffered coefficients to "psd".
+// Only columns 0 .. block_size / 2 of each row are accumulated; the remaining
+// psd entries are left untouched.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+                             float *psd) {
+  const int size = noise_tx->block_size;
+  const int last_col = size / 2;
+  for (int row = 0; row < size; ++row) {
+    const float *coeff_row = noise_tx->tx_block + 2 * (row * size);
+    float *psd_row = psd + row * size;
+    for (int col = 0; col <= last_col; ++col) {
+      const float *c = coeff_row + 2 * col;
+      psd_row[col] += c[0] * c[0] + c[1] * c[1];
+    }
+  }
+}
+
+// Releases a transform created by aom_noise_tx_malloc, including both of its
+// buffers. Passing NULL is a no-op.
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+  if (noise_tx != NULL) {
+    aom_free(noise_tx->tx_block);
+    aom_free(noise_tx->temp);
+    aom_free(noise_tx);
+  }
+}
+
+// Returns dot(a, b) / (|a| * |b|) for the two length-n vectors a and b.
+// No special handling is done for zero-length vectors: the division is
+// performed as is.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n) {
+  double energy_a = 0;
+  double energy_b = 0;
+  double dot = 0;
+  int i = 0;
+  while (i < n) {
+    const double va = a[i];
+    const double vb = b[i];
+    energy_a += va * va;
+    energy_b += vb * vb;
+    dot += va * vb;
+    ++i;
+  }
+  return dot / (sqrt(energy_a) * sqrt(energy_b));
+}
+
+// Validates the noise samples in "data", laid out as h rows of w values.
+//
+// The mean and variance of every row and every column are compared with the
+// mean and variance of the whole buffer. Returns 1 when all of them lie within
+// the fixed thresholds. Returns 0 when a row or column deviates (a diagnostic
+// is printed to stderr for the first failing row and the first failing
+// column) or when the scratch buffers cannot be allocated.
+int aom_noise_data_validate(const double *data, int w, int h) {
+  const double kVarianceThreshold = 2;
+  const double kMeanThreshold = 2;
+
+  int x = 0, y = 0;
+  int ret_value = 1;
+  double var = 0, mean = 0;
+
+  // Check that noise variance is not increasing in x or y
+  // and that the data is zero mean.
+  // *_x buffers hold one entry per column, *_y buffers one entry per row.
+  double *mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
+  double *var_x = (double *)aom_malloc(sizeof(*var_x) * w);
+  double *mean_y = (double *)aom_malloc(sizeof(*mean_y) * h);
+  double *var_y = (double *)aom_malloc(sizeof(*var_y) * h);
+
+  // Bail out before touching the buffers if any allocation failed. This file
+  // already passes possibly-NULL pointers to aom_free (see aom_noise_tx_free).
+  if (!mean_x || !var_x || !mean_y || !var_y) {
+    fprintf(stderr, "Unable to allocate noise validation buffers\n");
+    aom_free(mean_x);
+    aom_free(mean_y);
+    aom_free(var_x);
+    aom_free(var_y);
+    return 0;
+  }
+
+  memset(mean_x, 0, sizeof(*mean_x) * w);
+  memset(var_x, 0, sizeof(*var_x) * w);
+  memset(mean_y, 0, sizeof(*mean_y) * h);
+  memset(var_y, 0, sizeof(*var_y) * h);
+
+  // Accumulate sums and sums of squares per column, per row and overall.
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const double d = data[y * w + x];
+      var_x[x] += d * d;
+      var_y[y] += d * d;
+      mean_x[x] += d;
+      mean_y[y] += d;
+      var += d * d;
+      mean += d;
+    }
+  }
+  // Computed in double so that w * h cannot overflow int.
+  const double num_samples = (double)w * h;
+  mean /= num_samples;
+  var = var / num_samples - mean * mean;
+
+  // Each row accumulated w samples, so row statistics are normalized by w.
+  for (y = 0; y < h; ++y) {
+    mean_y[y] /= w;
+    var_y[y] = var_y[y] / w - mean_y[y] * mean_y[y];
+    if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  // Each column accumulated h samples, so column statistics are normalized
+  // by h.
+  for (x = 0; x < w; ++x) {
+    mean_x[x] /= h;
+    var_x[x] = var_x[x] / h - mean_x[x] * mean_x[x];
+    if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  aom_free(mean_x);
+  aom_free(mean_y);
+  aom_free(var_x);
+  aom_free(var_y);
+
+  return ret_value;
+}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
new file mode 100644
index 000000000..ea4d9e3de
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_NOISE_UTIL_H_
+#define AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and does hold the transformed data (as
+// the user should not be manipulating the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be free'd with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the internal data and holds it in the aom_noise_tx's internal
+// buffer. For compatibility with existing SIMD implementations, "data" must
+// be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+                          const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+                             float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// where a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c
deleted file mode 100644
index a42fb806b..000000000
--- a/third_party/aom/aom_dsp/prob.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-
-#include <string.h>
-
-#include "aom_dsp/prob.h"
-
-static unsigned int tree_merge_probs_impl(unsigned int i,
-                                          const aom_tree_index *tree,
-                                          const aom_prob *pre_probs,
-                                          const unsigned int *counts,
-                                          aom_prob *probs) {
-  const int l = tree[i];
-  const unsigned int left_count =
-      (l <= 0) ? counts[-l]
-               : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
-  const int r = tree[i + 1];
-  const unsigned int right_count =
-      (r <= 0) ? counts[-r]
-               : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
-  const unsigned int ct[2] = { left_count, right_count };
-  probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
-  return left_count + right_count;
-}
-
-void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
-                          const unsigned int *counts, aom_prob *probs) {
-  tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
-}
-
-typedef struct tree_node tree_node;
-
-struct tree_node {
-  aom_tree_index index;
-  uint8_t probs[16];
-  uint8_t prob;
-  int path;
-  int len;
-  int l;
-  int r;
-  aom_cdf_prob pdf;
-};
-
-/* Compute the probability of this node in Q23 */
-static uint32_t tree_node_prob(tree_node n, int i) {
-  uint32_t prob;
-  /* 1.0 in Q23 */
-  prob = 16777216;
-  for (; i < n.len; i++) {
-    prob = prob * n.probs[i] >> 8;
-  }
-  return prob;
-}
-
-static int tree_node_cmp(tree_node a, tree_node b) {
-  int i;
-  uint32_t pa;
-  uint32_t pb;
-  for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
-  }
-  pa = tree_node_prob(a, i);
-  pb = tree_node_prob(b, i);
-  return pa > pb ? 1 : pa < pb ? -1 : 0;
-}
-
-/* Given a Q15 probability for symbol subtree rooted at tree[n], this function
-    computes the probability of each symbol (defined as a node that has no
-    children). */
-static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n,
-                                            aom_cdf_prob pdf) {
-  if (tree[n].l == 0) {
-    /* This prevents probability computations in Q15 that underflow from
-        producing a symbol that has zero probability. */
-    if (pdf == 0) pdf = 1;
-    tree[n].pdf = pdf;
-    return pdf;
-  } else {
-    /* We process the smaller probability first,  */
-    if (tree[n].prob < 128) {
-      aom_cdf_prob lp;
-      aom_cdf_prob rp;
-      lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8;
-      lp = tree_node_compute_probs(tree, tree[n].l, lp);
-      rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp);
-      return lp + rp;
-    } else {
-      aom_cdf_prob rp;
-      aom_cdf_prob lp;
-      rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8;
-      rp = tree_node_compute_probs(tree, tree[n].r, rp);
-      lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp);
-      return lp + rp;
-    }
-  }
-}
-
-static int tree_node_extract(tree_node *tree, int n, int symb,
-                             aom_cdf_prob *pdf, aom_tree_index *index,
-                             int *path, int *len) {
-  if (tree[n].l == 0) {
-    pdf[symb] = tree[n].pdf;
-    if (index != NULL) index[symb] = tree[n].index;
-    if (path != NULL) path[symb] = tree[n].path;
-    if (len != NULL) len[symb] = tree[n].len;
-    return symb + 1;
-  } else {
-    symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len);
-    return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len);
-  }
-}
-
-int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
-                aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index,
-                int *path, int *len) {
-  tree_node symb[2 * 16 - 1];
-  int nodes;
-  int next[16];
-  int size;
-  int nsymbs;
-  int i;
-  /* Create the root node with probability 1 in Q15. */
-  symb[0].index = root;
-  symb[0].path = 0;
-  symb[0].len = 0;
-  symb[0].l = symb[0].r = 0;
-  nodes = 1;
-  next[0] = 0;
-  size = 1;
-  nsymbs = 1;
-  while (size > 0 && nsymbs < 16) {
-    int m;
-    tree_node n;
-    aom_tree_index j;
-    uint8_t prob;
-    m = 0;
-    /* Find the internal node with the largest probability. */
-    for (i = 1; i < size; i++) {
-      if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i;
-    }
-    i = next[m];
-    memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1)));
-    size--;
-    /* Split this symbol into two symbols */
-    n = symb[i];
-    j = n.index;
-    prob = probs[j >> 1];
-    /* Left */
-    n.index = tree[j];
-    n.path <<= 1;
-    n.len++;
-    n.probs[n.len - 1] = prob;
-    symb[nodes] = n;
-    if (n.index > 0) {
-      next[size++] = nodes;
-    }
-    /* Right */
-    n.index = tree[j + 1];
-    n.path += 1;
-    n.probs[n.len - 1] = 256 - prob;
-    symb[nodes + 1] = n;
-    if (n.index > 0) {
-      next[size++] = nodes + 1;
-    }
-    symb[i].prob = prob;
-    symb[i].l = nodes;
-    symb[i].r = nodes + 1;
-    nodes += 2;
-    nsymbs++;
-  }
-  /* Compute the probabilities of each symbol in Q15 */
-  tree_node_compute_probs(symb, 0, CDF_PROB_TOP);
-  /* Extract the cdf, index, path and length */
-  tree_node_extract(symb, 0, 0, cdf, index, path, len);
-  /* Convert to CDF */
-  cdf[0] = AOM_ICDF(cdf[0]);
-  for (i = 1; i < nsymbs; i++) {
-    cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]);
-  }
-  // Store symbol count at the end of the CDF
-  cdf[nsymbs] = 0;
-  return nsymbs;
-}
-
-/* This code assumes that tree contains as unique leaf nodes the integer values
-    0 to len - 1 and produces the forward and inverse mapping tables in ind[]
-    and inv[] respectively. */
-static void tree_to_index(int *stack_index, int *ind, int *inv,
-                          const aom_tree_index *tree, int value, int index) {
-  value *= 2;
-
-  do {
-    const aom_tree_index content = tree[index];
-    ++index;
-    if (content <= 0) {
-      inv[*stack_index] = -content;
-      ind[-content] = *stack_index;
-      ++(*stack_index);
-    } else {
-      tree_to_index(stack_index, ind, inv, tree, value, content);
-    }
-  } while (++value & 1);
-}
-
-void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) {
-  int stack_index = 0;
-  tree_to_index(&stack_index, ind, inv, tree, 0, 0);
-}
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
index a517e810a..85dd4249d 100644
--- a/third_party/aom/aom_dsp/prob.h
+++ b/third_party/aom/aom_dsp/prob.h
@@ -13,194 +13,657 @@
 #define AOM_DSP_PROB_H_
 
 #include <assert.h>
+#include <stdio.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_common.h"
+#include "config/aom_config.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem.h"
 
-#if !CONFIG_ANS
-#include "aom_dsp/entcode.h"
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef uint8_t aom_prob;
-
 // TODO(negge): Rename this aom_prob once we remove vpxbool.
 typedef uint16_t aom_cdf_prob;
 
 #define CDF_SIZE(x) ((x) + 1)
-
 #define CDF_PROB_BITS 15
 #define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+#define CDF_INIT_TOP 32768
+#define CDF_SHIFT (15 - CDF_PROB_BITS)
+/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+  probability (an "inverse" CDF).
+  This macro converts from one representation to the other (and is its own
+  inverse).*/
+#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
+
+#if CDF_SHIFT == 0
+
+#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2) \
+  AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3) \
+  AOM_ICDF(a0)                   \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4)                        \
+  AOM_ICDF(a0)                                              \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5)                                  \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6)                              \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7)                          \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8)                     \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)                 \
+  AOM_ICDF(a0)                                                            \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9),             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)               \
+  AOM_ICDF(a0)                                                               \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11)          \
+  AOM_ICDF(a0)                                                               \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+      AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12)     \
+  AOM_ICDF(a0)                                                               \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+  AOM_ICDF(a0)                                                                \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),     \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10),  \
+      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+                  a14)                                                        \
+  AOM_ICDF(a0)                                                                \
+  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),     \
+      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10),  \
+      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14),             \
+      AOM_ICDF(CDF_PROB_TOP), 0
 
-#if !CONFIG_ANS
-#define AOM_ICDF OD_ICDF
 #else
-#define AOM_ICDF(x) (x)
-#endif
-
-#define MAX_PROB 255
-
-#define LV_MAP_PROB 1
+#define AOM_CDF2(a0)                                       \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \
+            ((CDF_INIT_TOP - 2) >> 1)) /                   \
+               ((CDF_INIT_TOP - 2)) +                      \
+           1)                                              \
+  , AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1)                                       \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) +     \
+            ((CDF_INIT_TOP - 3) >> 1)) /                       \
+               ((CDF_INIT_TOP - 3)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
+                ((CDF_INIT_TOP - 3) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 3)) +                      \
+               2),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2)                                   \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) +     \
+            ((CDF_INIT_TOP - 4) >> 1)) /                       \
+               ((CDF_INIT_TOP - 4)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+                ((CDF_INIT_TOP - 4) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 4)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+                ((CDF_INIT_TOP - 4) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 4)) +                      \
+               3),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3)                               \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) +     \
+            ((CDF_INIT_TOP - 5) >> 1)) /                       \
+               ((CDF_INIT_TOP - 5)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+                ((CDF_INIT_TOP - 5) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 5)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+                ((CDF_INIT_TOP - 5) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 5)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+                ((CDF_INIT_TOP - 5) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 5)) +                      \
+               4),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4)                           \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) +     \
+            ((CDF_INIT_TOP - 6) >> 1)) /                       \
+               ((CDF_INIT_TOP - 6)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+                ((CDF_INIT_TOP - 6) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 6)) +                      \
+               5),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5)                       \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) +     \
+            ((CDF_INIT_TOP - 7) >> 1)) /                       \
+               ((CDF_INIT_TOP - 7)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               5),                                             \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+                ((CDF_INIT_TOP - 7) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 7)) +                      \
+               6),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6)                   \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) +     \
+            ((CDF_INIT_TOP - 8) >> 1)) /                       \
+               ((CDF_INIT_TOP - 8)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               5),                                             \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               6),                                             \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+                ((CDF_INIT_TOP - 8) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 8)) +                      \
+               7),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7)               \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) +     \
+            ((CDF_INIT_TOP - 9) >> 1)) /                       \
+               ((CDF_INIT_TOP - 9)) +                          \
+           1)                                                  \
+  ,                                                            \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               2),                                             \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               3),                                             \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               4),                                             \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               5),                                             \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               6),                                             \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               7),                                             \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+                ((CDF_INIT_TOP - 9) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 9)) +                      \
+               8),                                             \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8)           \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) +     \
+            ((CDF_INIT_TOP - 10) >> 1)) /                       \
+               ((CDF_INIT_TOP - 10)) +                          \
+           1)                                                   \
+  ,                                                             \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               2),                                              \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               3),                                              \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               4),                                              \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               5),                                              \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               6),                                              \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               7),                                              \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               8),                                              \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+                ((CDF_INIT_TOP - 10) >> 1)) /                   \
+                   ((CDF_INIT_TOP - 10)) +                      \
+               9),                                              \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)        \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +      \
+            ((CDF_INIT_TOP - 11) >> 1)) /                        \
+               ((CDF_INIT_TOP - 11)) +                           \
+           1)                                                    \
+  ,                                                              \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               2),                                               \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               3),                                               \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               4),                                               \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               5),                                               \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               6),                                               \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               7),                                               \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               8),                                               \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               9),                                               \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+                ((CDF_INIT_TOP - 11) >> 1)) /                    \
+                   ((CDF_INIT_TOP - 11)) +                       \
+               10),                                              \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)    \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +       \
+            ((CDF_INIT_TOP - 12) >> 1)) /                         \
+               ((CDF_INIT_TOP - 12)) +                            \
+           1)                                                     \
+  ,                                                               \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               2),                                                \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               3),                                                \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               4),                                                \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               5),                                                \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               6),                                                \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               7),                                                \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               8),                                                \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               9),                                                \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +  \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               10),                                               \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+                ((CDF_INIT_TOP - 12) >> 1)) /                     \
+                   ((CDF_INIT_TOP - 12)) +                        \
+               11),                                               \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +         \
+            ((CDF_INIT_TOP - 13) >> 1)) /                           \
+               ((CDF_INIT_TOP - 13)) +                              \
+           1)                                                       \
+  ,                                                                 \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               2),                                                  \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               3),                                                  \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               4),                                                  \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               5),                                                  \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               6),                                                  \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               7),                                                  \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               8),                                                  \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               9),                                                  \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +    \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               10),                                                 \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +   \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               11),                                                 \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +   \
+                ((CDF_INIT_TOP - 13) >> 1)) /                       \
+                   ((CDF_INIT_TOP - 13)) +                          \
+               12),                                                 \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +              \
+            ((CDF_INIT_TOP - 14) >> 1)) /                                \
+               ((CDF_INIT_TOP - 14)) +                                   \
+           1)                                                            \
+  ,                                                                      \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               2),                                                       \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               3),                                                       \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               4),                                                       \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               5),                                                       \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               6),                                                       \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               7),                                                       \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               8),                                                       \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               9),                                                       \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +         \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               10),                                                      \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               11),                                                      \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               12),                                                      \
+      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
+                ((CDF_INIT_TOP - 14) >> 1)) /                            \
+                   ((CDF_INIT_TOP - 14)) +                               \
+               13),                                                      \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +                   \
+            ((CDF_INIT_TOP - 15) >> 1)) /                                     \
+               ((CDF_INIT_TOP - 15)) +                                        \
+           1)                                                                 \
+  ,                                                                           \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               2),                                                            \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               3),                                                            \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               4),                                                            \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               5),                                                            \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               6),                                                            \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               7),                                                            \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               8),                                                            \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               9),                                                            \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +              \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               10),                                                           \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               11),                                                           \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               12),                                                           \
+      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               13),                                                           \
+      AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
+                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 15)) +                                    \
+               14),                                                           \
+      AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+                  a14)                                                        \
+  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +                   \
+            ((CDF_INIT_TOP - 16) >> 1)) /                                     \
+               ((CDF_INIT_TOP - 16)) +                                        \
+           1)                                                                 \
+  ,                                                                           \
+      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               2),                                                            \
+      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               3),                                                            \
+      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               4),                                                            \
+      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               5),                                                            \
+      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               6),                                                            \
+      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               7),                                                            \
+      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               8),                                                            \
+      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               9),                                                            \
+      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +              \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               10),                                                           \
+      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               11),                                                           \
+      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               12),                                                           \
+      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               13),                                                           \
+      AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               14),                                                           \
+      AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
+                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
+                   ((CDF_INIT_TOP - 16)) +                                    \
+               15),                                                           \
+      AOM_ICDF(CDF_PROB_TOP), 0
 
-#define BR_NODE 1
-
-#if CONFIG_ADAPT_SCAN
-#define CACHE_SCAN_PROB 1
 #endif
 
-#define aom_prob_half ((aom_prob)128)
-
-typedef int8_t aom_tree_index;
-
-#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
-
-#define MODE_MV_COUNT_SAT 20
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of aom_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0. */
-
-typedef const aom_tree_index aom_tree[];
-
-static INLINE aom_prob get_prob(unsigned int num, unsigned int den) {
+static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
   assert(den != 0);
   {
     const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
     // (p > 255) ? 255 : (p < 1) ? 1 : p;
     const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
-    return (aom_prob)clipped_prob;
+    return (uint8_t)clipped_prob;
   }
 }
 
-static INLINE aom_prob get_binary_prob(unsigned int n0, unsigned int n1) {
-  const unsigned int den = n0 + n1;
-  if (den == 0) return 128u;
-  return get_prob(n0, den);
-}
-
-/* This function assumes prob1 and prob2 are already within [1,255] range. */
-static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) {
-  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
-}
-
-static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2],
-                                   unsigned int count_sat,
-                                   unsigned int max_update_factor) {
-  const aom_prob prob = get_binary_prob(ct[0], ct[1]);
-  const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat);
-  const unsigned int factor = max_update_factor * count / count_sat;
-  return weighted_prob(pre_prob, prob, factor);
-}
-
-// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
-static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
-  0,  6,  12, 19, 25, 32,  38,  44,  51,  57, 64,
-  70, 76, 83, 89, 96, 102, 108, 115, 121, 128
-};
-
-static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob,
-                                           const unsigned int ct[2]) {
-  const unsigned int den = ct[0] + ct[1];
-  if (den == 0) {
-    return pre_prob;
-  } else {
-    const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT);
-    const unsigned int factor = count_to_update_factor[count];
-    const aom_prob prob = get_prob(ct[0], den);
-    return weighted_prob(pre_prob, prob, factor);
-  }
-}
-
-void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
-                          const unsigned int *counts, aom_prob *probs);
-
-int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
-                aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
-                int *pth, int *len);
-
-static INLINE void av1_tree_to_cdf(const aom_tree_index *tree,
-                                   const aom_prob *probs, aom_cdf_prob *cdf) {
-  aom_tree_index index[16];
-  int path[16];
-  int dist[16];
-  tree_to_cdf(tree, probs, 0, cdf, index, path, dist);
-}
-
-#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \
-  do {                                          \
-    int i;                                      \
-    for (i = 0; i < u; i++) {                   \
-      av1_tree_to_cdf(tree, probs[i], cdf[i]);  \
-    }                                           \
-  } while (0)
-
-#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u)     \
-  do {                                                 \
-    int j;                                             \
-    int i;                                             \
-    for (j = 0; j < v; j++) {                          \
-      for (i = 0; i < u; i++) {                        \
-        av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \
-      }                                                \
-    }                                                  \
-  } while (0)
-
-void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree);
-
 static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
-  int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs);
-#if CONFIG_LV_MAP
-  if (nsymbs == 2)
-    rate = 4 + (cdf[nsymbs] > 7) + (cdf[nsymbs] > 15) + get_msb(nsymbs);
-#endif
-  const int rate2 = 5;
+  int rate;
   int i, tmp;
-  int diff;
-#if 1
-  const int tmp0 = 1 << rate2;
-  tmp = AOM_ICDF(tmp0);
-  diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate;
-// Single loop (faster)
-#if !CONFIG_ANS
-  for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) {
-    tmp -= (i == val ? diff : 0);
-    cdf[i] += ((tmp - cdf[i]) >> rate);
-  }
-#else
-  for (i = 0; i < nsymbs - 1; ++i, tmp += tmp0) {
-    tmp += (i == val ? diff : 0);
-    cdf[i] -= ((cdf[i] - tmp) >> rate);
-  }
-#endif
-#else
-  for (i = 0; i < nsymbs; ++i) {
-    tmp = (i + 1) << rate2;
-    cdf[i] -= ((cdf[i] - tmp) >> rate);
-  }
-  diff = CDF_PROB_TOP - cdf[nsymbs - 1];
 
-  for (i = val; i < nsymbs; ++i) {
-    cdf[i] += diff;
+  static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2,
+                                        2, 2, 2, 2, 2, 2, 2, 2 };
+  assert(nsymbs < 17);
+  rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) +
+         nsymbs2speed[nsymbs];  // + get_msb(nsymbs);
+  tmp = AOM_ICDF(0);
+
+  // Single loop (faster)
+  for (i = 0; i < nsymbs - 1; ++i) {
+    tmp = (i == val) ? 0 : tmp;
+    if (tmp < cdf[i]) {
+      cdf[i] -= ((cdf[i] - tmp) >> rate);
+    } else {
+      cdf[i] += ((tmp - cdf[i]) >> rate);
+    }
   }
-#endif
   cdf[nsymbs] += (cdf[nsymbs] < 32);
 }
 
-#if CONFIG_LV_MAP
-static INLINE void update_bin(aom_cdf_prob *cdf, int val, int nsymbs) {
-  update_cdf(cdf, val, nsymbs);
-}
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
index d543f12d1..37d3bb585 100644
--- a/third_party/aom/aom_dsp/psnr.c
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/psnr.h"
 #include "aom_scale/yv12config.h"
 
@@ -26,8 +27,8 @@ double aom_sse_to_psnr(double samples, double peak, double sse) {
 }
 
 /* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
-* and highbd_8_variance(). It should not.
-*/
+ * and highbd_8_variance(). It should not.
+ */
 static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int w, int h, unsigned int *sse,
                              int *sum) {
@@ -48,26 +49,26 @@ static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
                                       const uint8_t *b8, int b_stride, int w,
                                       int h, uint64_t *sse, int64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t tsum = 0;
+  uint64_t tsse = 0;
+  for (int i = 0; i < h; ++i) {
+    int32_t lsum = 0;
+    for (int j = 0; j < w; ++j) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      lsum += diff;
+      tsse += (uint32_t)(diff * diff);
     }
+    tsum += lsum;
     a += a_stride;
     b += b_stride;
   }
+  *sum = tsum;
+  *sse = tsse;
 }
 
 static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
@@ -80,7 +81,6 @@ static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
   *sse = (unsigned int)sse_long;
   *sum = (int)sum_long;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, int width, int height) {
@@ -122,7 +122,6 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   return total_sse;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
                                     const uint8_t *b8, int b_stride, int width,
                                     int height, unsigned int input_shift) {
@@ -175,7 +174,6 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   }
   return total_sse;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
@@ -228,7 +226,6 @@ int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
                  a->uv_crop_width, a->uv_crop_height);
 }
 
-#if CONFIG_HIGHBITDEPTH
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height) {
@@ -287,11 +284,9 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
   return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
                         a->uv_crop_width, a->uv_crop_height);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
-#if CONFIG_HIGHBITDEPTH
   if (highbd) {
     switch (plane) {
       case 0: return aom_highbd_get_y_sse(a, b);
@@ -300,7 +295,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
       default: assert(plane >= 0 && plane <= 2); return 0;
     }
   }
-#endif
   (void)highbd;
   switch (plane) {
     case 0: return aom_get_y_sse(a, b);
@@ -310,7 +304,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           uint32_t bit_depth, uint32_t in_bit_depth) {
@@ -356,8 +349,6 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
       aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
 }
 
-#endif  // !CONFIG_HIGHBITDEPTH
-
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr) {
   static const double peak = 255.0;
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
index df5f8f9f2..8300b0a88 100644
--- a/third_party/aom/aom_dsp/psnr.h
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -27,13 +27,13 @@ typedef struct {
 } PSNR_STATS;
 
 /*!\brief Converts SSE to PSNR
-*
-* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
-*
-* \param[in]    samples       Number of samples
-* \param[in]    peak          Max sample value
-* \param[in]    sse           Sum of squared errors
-*/
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
 double aom_sse_to_psnr(double samples, double peak, double sse);
 int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                            const YV12_BUFFER_CONFIG *b, int hstart, int width,
@@ -49,7 +49,6 @@ int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
 int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int plane, int highbd);
-#if CONFIG_HIGHBITDEPTH
 int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                   const YV12_BUFFER_CONFIG *b, int hstart,
                                   int width, int vstart, int height);
@@ -68,7 +67,6 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
 void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                           unsigned int bit_depth, unsigned int in_bit_depth);
-#endif
 void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                    PSNR_STATS *psnr);
 
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
index aeefd5908..30fe21d9c 100644
--- a/third_party/aom/aom_dsp/psnrhvs.c
+++ b/third_party/aom/aom_dsp/psnrhvs.c
@@ -17,17 +17,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/psnr.h"
 #include "aom_dsp/ssim.h"
 #include "aom_ports/system_state.h"
 
-#if !defined(M_PI)
-#define M_PI (3.141592653589793238462643)
-#endif
-#include <string.h>
-
 static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                            int xstride) {
   int i, j;
@@ -38,7 +34,6 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                                int xstride) {
   int i, j;
@@ -48,7 +43,6 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
     for (j = 0; j < 8; j++)
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
-#endif
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
  * transparency. This is not the JPEG based matrix from the paper,
@@ -123,14 +117,16 @@ static double convert_score_db(double _score, double _weight, int bit_depth) {
 static double calc_psnrhvs(const unsigned char *src, int _systride,
                            const unsigned char *dst, int _dystride, double _par,
                            int _w, int _h, int _step, const double _csf[8][8],
-                           uint32_t bit_depth, uint32_t _shift) {
+                           uint32_t _shift, int buf_is_hbd) {
   double ret;
   const uint8_t *_src8 = src;
   const uint8_t *_dst8 = dst;
   const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
   const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
-  int16_t dct_s[8 * 8], dct_d[8 * 8];
-  tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+  DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+  DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
   double mask[8][8];
   int pixels;
   int x;
@@ -176,10 +172,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
           int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
-          if (bit_depth == 8 && _shift == 0) {
+          if (!buf_is_hbd) {
             dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
             dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
-          } else if (bit_depth == 10 || bit_depth == 12) {
+          } else {
             dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
             dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
           }
@@ -212,15 +208,12 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
         s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
       if (d_gvar > 0)
         d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
-#if CONFIG_HIGHBITDEPTH
-      if (bit_depth == 10 || bit_depth == 12) {
-        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
-      }
-#endif
-      if (bit_depth == 8) {
+      if (!buf_is_hbd) {
         od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+      } else {
+        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
       }
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
@@ -256,21 +249,24 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
   const int step = 7;
   uint32_t bd_shift = 0;
   aom_clear_system_state();
-
   assert(bd == 8 || bd == 10 || bd == 12);
   assert(bd >= in_bd);
+  assert(src->flags == dst->flags);
+  const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
 
   bd_shift = bd - in_bd;
 
-  *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer,
-                            dst->y_stride, par, src->y_crop_width,
-                            src->y_crop_height, step, csf_y, bd, bd_shift);
-  *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer,
-                            dst->uv_stride, par, src->uv_crop_width,
-                            src->uv_crop_height, step, csf_cb420, bd, bd_shift);
-  *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer,
-                            dst->uv_stride, par, src->uv_crop_width,
-                            src->uv_crop_height, step, csf_cr420, bd, bd_shift);
+  *y_psnrhvs = calc_psnrhvs(
+      src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
+      src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd);
+  *u_psnrhvs =
+      calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+                   par, src->uv_crop_width, src->uv_crop_height, step,
+                   csf_cb420, bd_shift, buf_is_hbd);
+  *v_psnrhvs =
+      calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+                   par, src->uv_crop_width, src->uv_crop_height, step,
+                   csf_cr420, bd_shift, buf_is_hbd);
   psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
   return convert_score_db(psnrhvs, 1.0, in_bd);
 }
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
index 21bcc486a..e1601cc7d 100644
--- a/third_party/aom/aom_dsp/quantize.c
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -66,7 +66,8 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
         const int dequant =
             (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
             AOM_QM_BITS;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale);
+        const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+        dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
 
         if (tmp32) eob = i;
       }
@@ -87,11 +88,7 @@ void highbd_quantize_b_helper_c(
                          ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
   int dequant;
-#if CONFIG_TX64X64
   int idx_arr[4096];
-#else
-  int idx_arr[1024];
-#endif
   (void)iscan;
   int idx = 0;
 
@@ -130,45 +127,14 @@ void highbd_quantize_b_helper_c(
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
                 AOM_QM_BITS;
-      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
+      const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
       if (abs_qcoeff) eob = idx_arr[i];
     }
   }
   *eob_ptr = eob + 1;
 }
 
-void quantize_dc_helper(const tran_low_t *coeff_ptr, int n_coeffs,
-                        int skip_block, const int16_t *round_ptr,
-                        const int16_t quant, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                        uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-                        const qm_val_t *iqm_ptr, const int log_scale) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int64_t tmp;
-  int eob = -1;
-  int32_t tmp32;
-  int dequant;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
-                INT16_MIN, INT16_MAX);
-    tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-    dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale);
-    if (tmp32) eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
 /* These functions should only be called when quantisation matrices
    are not used. */
 void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -195,7 +161,6 @@ void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1);
 }
 
-#if CONFIG_TX64X64
 void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block, const int16_t *zbin_ptr,
                             const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -207,34 +172,6 @@ void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
                       dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2);
 }
-#endif  // CONFIG_TX64X64
-
-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  quantize_dc_helper(coeff_ptr, n_coeffs, skip_block, round_ptr, quant,
-                     qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL,
-                     0);
-}
-
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  quantize_dc_helper(coeff_ptr, 1024, skip_block, round_ptr, quant, qcoeff_ptr,
-                     dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 1);
-}
-
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  quantize_dc_helper(coeff_ptr, 4096, skip_block, round_ptr, quant, qcoeff_ptr,
-                     dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 2);
-}
-#endif  // CONFIG_TX64X64
 
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
@@ -261,7 +198,6 @@ void aom_highbd_quantize_b_32x32_c(
                              NULL, NULL, 1);
 }
 
-#if CONFIG_TX64X64
 void aom_highbd_quantize_b_64x64_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
     const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -273,4 +209,3 @@ void aom_highbd_quantize_b_64x64_c(
                              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                              NULL, NULL, 2);
 }
-#endif  // CONFIG_TX64X64
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
index 03609e8b4..56d50b929 100644
--- a/third_party/aom/aom_dsp/quantize.h
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -12,7 +12,8 @@
 #ifndef AOM_DSP_QUANTIZE_H_
 #define AOM_DSP_QUANTIZE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 
 #ifdef __cplusplus
@@ -44,7 +45,6 @@ void highbd_quantize_b_helper_c(
     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
     const qm_val_t *iqm_ptr, const int log_scale);
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
                              const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -52,69 +52,6 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan);
-#endif
-
-void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
-#if CONFIG_TX64X64
-void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
-#endif  // CONFIG_TX64X64
-
-#if CONFIG_AOM_QM
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-                            const qm_val_t *iqm_ptr);
-void aom_highbd_quantize_dc_32x32(
-    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
-    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr);
-#if CONFIG_TX64X64
-void aom_highbd_quantize_dc_64x64(
-    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
-    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH
-
-#else  // CONFIG_AOM_QM
-
-#if CONFIG_HIGHBITDEPTH
-void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr);
-void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
-#if CONFIG_TX64X64
-void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
-#endif  // CONFIG_TX64X64
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AOM_QM
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
index 6b8ca669b..ede4c583b 100644
--- a/third_party/aom/aom_dsp/sad.c
+++ b/third_party/aom/aom_dsp/sad.c
@@ -11,8 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -33,32 +33,35 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
   return sad;
 }
 
-#define sadMxN(m, n)                                                        \
-  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \
-                                    const uint8_t *ref, int ref_stride) {   \
-    return sad(src, src_stride, ref, ref_stride, m, n);                     \
-  }                                                                         \
-  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                        const uint8_t *ref, int ref_stride, \
-                                        const uint8_t *second_pred) {       \
-    uint8_t comp_pred[m * n];                                               \
-    aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \
-    return sad(src, src_stride, comp_pred, m, m, n);                        \
+#define sadMxh(m)                                                          \
+  unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride,            \
+                                const uint8_t *b, int b_stride, int width, \
+                                int height) {                              \
+    return sad(a, a_stride, b, b_stride, width, height);                   \
   }
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                   \
-  void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \
-                                  const uint8_t *ref_array, int ref_stride, \
-                                  uint32_t *sad_array) {                    \
-    int i;                                                                  \
-    for (i = 0; i < k; ++i)                                                 \
-      sad_array[i] =                                                        \
-          aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+#define sadMxN(m, n)                                                          \
+  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,       \
+                                    const uint8_t *ref, int ref_stride) {     \
+    return sad(src, src_stride, ref, ref_stride, m, n);                       \
+  }                                                                           \
+  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride,   \
+                                        const uint8_t *ref, int ref_stride,   \
+                                        const uint8_t *second_pred) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);         \
+    return sad(src, src_stride, comp_pred, m, m, n);                          \
+  }                                                                           \
+  unsigned int aom_jnt_sad##m##x##n##_avg_c(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride,    \
+                            jcp_param);                                       \
+    return sad(src, src_stride, comp_pred, m, m, n);                          \
   }
 
-// This appears to be equivalent to the above when k == 4 and refs is const
+// Calculate sad against 4 reference locations and store each in sad_array
 #define sadMxNx4D(m, n)                                                    \
   void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
                                const uint8_t *const ref_array[],           \
@@ -70,11 +73,8 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
   }
 
 /* clang-format off */
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 // 128x128
 sadMxN(128, 128)
-sadMxNxK(128, 128, 3)
-sadMxNxK(128, 128, 8)
 sadMxNx4D(128, 128)
 
 // 128x64
@@ -84,12 +84,9 @@ sadMxNx4D(128, 64)
 // 64x128
 sadMxN(64, 128)
 sadMxNx4D(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
 // 64x64
 sadMxN(64, 64)
-sadMxNxK(64, 64, 3)
-sadMxNxK(64, 64, 8)
 sadMxNx4D(64, 64)
 
 // 64x32
@@ -102,8 +99,6 @@ sadMxNx4D(32, 64)
 
 // 32x32
 sadMxN(32, 32)
-sadMxNxK(32, 32, 3)
-sadMxNxK(32, 32, 8)
 sadMxNx4D(32, 32)
 
 // 32x16
@@ -116,45 +111,39 @@ sadMxNx4D(16, 32)
 
 // 16x16
 sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
 sadMxNx4D(16, 16)
 
 // 16x8
 sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
 sadMxNx4D(16, 8)
 
 // 8x16
 sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
 sadMxNx4D(8, 16)
 
 // 8x8
 sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
 sadMxNx4D(8, 8)
 
 // 8x4
 sadMxN(8, 4)
-sadMxNxK(8, 4, 8)
 sadMxNx4D(8, 4)
 
 // 4x8
 sadMxN(4, 8)
-sadMxNxK(4, 8, 8)
 sadMxNx4D(4, 8)
 
 // 4x4
 sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
 sadMxNx4D(4, 4)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
+sadMxh(128)  // expands to a full function definition; no ';' needed
+sadMxh(64)
+sadMxh(32)
+sadMxh(16)
+sadMxh(8)
+sadMxh(4)
+
 sadMxN(4, 16)
 sadMxNx4D(4, 16)
 sadMxN(16, 4)
@@ -167,15 +156,10 @@ sadMxN(16, 64)
 sadMxNx4D(16, 64)
 sadMxN(64, 16)
 sadMxNx4D(64, 16)
-sadMxN(32, 128)
-sadMxNx4D(32, 128)
-sadMxN(128, 32)
-sadMxNx4D(128, 32)
-#endif
-/* clang-format on */
-
-#if CONFIG_HIGHBITDEPTH
-                            static INLINE
+
+    /* clang-format on */
+
+    static INLINE
     unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
                             int b_stride, int width, int height) {
   int y, x;
@@ -216,19 +200,16 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
       const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
       const uint8_t *second_pred) {                                            \
     uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
+    aom_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);   \
+    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+  }                                                                            \
+  unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
+    uint16_t comp_pred[m * n];                                                 \
+    aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref,            \
+                                 ref_stride, jcp_param);                       \
     return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
-  }
-
-#define highbd_sadMxNxK(m, n, k)                                             \
-  void aom_highbd_sad##m##x##n##x##k##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref_array,          \
-      int ref_stride, uint32_t *sad_array) {                                 \
-    int i;                                                                   \
-    for (i = 0; i < k; ++i) {                                                \
-      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 &ref_array[i], ref_stride); \
-    }                                                                        \
   }
 
 #define highbd_sadMxNx4D(m, n)                                               \
@@ -243,11 +224,8 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
   }
 
 /* clang-format off */
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 // 128x128
 highbd_sadMxN(128, 128)
-highbd_sadMxNxK(128, 128, 3)
-highbd_sadMxNxK(128, 128, 8)
 highbd_sadMxNx4D(128, 128)
 
 // 128x64
@@ -257,12 +235,9 @@ highbd_sadMxNx4D(128, 64)
 // 64x128
 highbd_sadMxN(64, 128)
 highbd_sadMxNx4D(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 
 // 64x64
 highbd_sadMxN(64, 64)
-highbd_sadMxNxK(64, 64, 3)
-highbd_sadMxNxK(64, 64, 8)
 highbd_sadMxNx4D(64, 64)
 
 // 64x32
@@ -275,8 +250,6 @@ highbd_sadMxNx4D(32, 64)
 
 // 32x32
 highbd_sadMxN(32, 32)
-highbd_sadMxNxK(32, 32, 3)
-highbd_sadMxNxK(32, 32, 8)
 highbd_sadMxNx4D(32, 32)
 
 // 32x16
@@ -289,45 +262,32 @@ highbd_sadMxNx4D(16, 32)
 
 // 16x16
 highbd_sadMxN(16, 16)
-highbd_sadMxNxK(16, 16, 3)
-highbd_sadMxNxK(16, 16, 8)
 highbd_sadMxNx4D(16, 16)
 
 // 16x8
 highbd_sadMxN(16, 8)
-highbd_sadMxNxK(16, 8, 3)
-highbd_sadMxNxK(16, 8, 8)
 highbd_sadMxNx4D(16, 8)
 
 // 8x16
 highbd_sadMxN(8, 16)
-highbd_sadMxNxK(8, 16, 3)
-highbd_sadMxNxK(8, 16, 8)
 highbd_sadMxNx4D(8, 16)
 
 // 8x8
 highbd_sadMxN(8, 8)
-highbd_sadMxNxK(8, 8, 3)
-highbd_sadMxNxK(8, 8, 8)
 highbd_sadMxNx4D(8, 8)
 
 // 8x4
 highbd_sadMxN(8, 4)
-highbd_sadMxNxK(8, 4, 8)
 highbd_sadMxNx4D(8, 4)
 
 // 4x8
 highbd_sadMxN(4, 8)
-highbd_sadMxNxK(4, 8, 8)
 highbd_sadMxNx4D(4, 8)
 
 // 4x4
 highbd_sadMxN(4, 4)
-highbd_sadMxNxK(4, 4, 3)
-highbd_sadMxNxK(4, 4, 8)
 highbd_sadMxNx4D(4, 4)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 highbd_sadMxN(4, 16)
 highbd_sadMxNx4D(4, 16)
 highbd_sadMxN(16, 4)
@@ -340,277 +300,4 @@ highbd_sadMxN(16, 64)
 highbd_sadMxNx4D(16, 64)
 highbd_sadMxN(64, 16)
 highbd_sadMxNx4D(64, 16)
-highbd_sadMxN(32, 128)
-highbd_sadMxNx4D(32, 128)
-highbd_sadMxN(128, 32)
-highbd_sadMxNx4D(128, 32)
-#endif
-/* clang-format on */
-#endif  // CONFIG_HIGHBITDEPTH
-
-#if CONFIG_AV1
-                                                static INLINE
-    unsigned int masked_sad(const uint8_t *src, int src_stride,
-                            const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride, const uint8_t *m, int m_stride,
-                            int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const uint8_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-#define MASKSADMxN(m, n)                                                       \
-  unsigned int aom_masked_sad##m##x##n##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
-      int invert_mask) {                                                       \
-    if (!invert_mask)                                                          \
-      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
-                        msk_stride, m, n);                                     \
-    else                                                                       \
-      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
-                        msk_stride, m, n);                                     \
-  }
-
-/* clang-format off */
-#if CONFIG_EXT_PARTITION
-MASKSADMxN(128, 128)
-MASKSADMxN(128, 64)
-MASKSADMxN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-MASKSADMxN(64, 64)
-MASKSADMxN(64, 32)
-MASKSADMxN(32, 64)
-MASKSADMxN(32, 32)
-MASKSADMxN(32, 16)
-MASKSADMxN(16, 32)
-MASKSADMxN(16, 16)
-MASKSADMxN(16, 8)
-MASKSADMxN(8, 16)
-MASKSADMxN(8, 8)
-MASKSADMxN(8, 4)
-MASKSADMxN(4, 8)
-MASKSADMxN(4, 4)
-
-#if CONFIG_EXT_PARTITION_TYPES
-MASKSADMxN(4, 16)
-MASKSADMxN(16, 4)
-MASKSADMxN(8, 32)
-MASKSADMxN(32, 8)
-MASKSADMxN(16, 64)
-MASKSADMxN(64, 16)
-MASKSADMxN(32, 128)
-MASKSADMxN(128, 32)
-#endif
-/* clang-format on */
-
-#if CONFIG_HIGHBITDEPTH
-                                static INLINE
-    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
-                                   const uint8_t *a8, int a_stride,
-                                   const uint8_t *b8, int b_stride,
-                                   const uint8_t *m, int m_stride, int width,
-                                   int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-#define HIGHBD_MASKSADMXN(m, n)                                         \
-  unsigned int aom_highbd_masked_sad##m##x##n##_c(                      \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,         \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,  \
-      int msk_stride, int invert_mask) {                                \
-    if (!invert_mask)                                                   \
-      return highbd_masked_sad(src8, src_stride, ref8, ref_stride,      \
-                               second_pred8, m, msk, msk_stride, m, n); \
-    else                                                                \
-      return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
-                               ref_stride, msk, msk_stride, m, n);      \
-  }
-
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN(128, 128)
-HIGHBD_MASKSADMXN(128, 64)
-HIGHBD_MASKSADMXN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN(64, 64)
-HIGHBD_MASKSADMXN(64, 32)
-HIGHBD_MASKSADMXN(32, 64)
-HIGHBD_MASKSADMXN(32, 32)
-HIGHBD_MASKSADMXN(32, 16)
-HIGHBD_MASKSADMXN(16, 32)
-HIGHBD_MASKSADMXN(16, 16)
-HIGHBD_MASKSADMXN(16, 8)
-HIGHBD_MASKSADMXN(8, 16)
-HIGHBD_MASKSADMXN(8, 8)
-HIGHBD_MASKSADMXN(8, 4)
-HIGHBD_MASKSADMXN(4, 8)
-HIGHBD_MASKSADMXN(4, 4)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
-HIGHBD_MASKSADMXN(4, 16)
-HIGHBD_MASKSADMXN(16, 4)
-HIGHBD_MASKSADMXN(8, 32)
-HIGHBD_MASKSADMXN(32, 8)
-HIGHBD_MASKSADMXN(16, 64)
-HIGHBD_MASKSADMXN(64, 16)
-HIGHBD_MASKSADMXN(32, 128)
-HIGHBD_MASKSADMXN(128, 32)
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1
-
-#if CONFIG_AV1 && CONFIG_MOTION_VAR
-// pre: predictor being evaluated
-// wsrc: target weighted prediction (has been *4096 to keep precision)
-// mask: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define OBMCSADMxN(m, n)                                                     \
-  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
-                                         const int32_t *wsrc,                \
-                                         const int32_t *mask) {              \
-    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
-  }
-
-/* clang-format off */
-#if CONFIG_EXT_PARTITION
-OBMCSADMxN(128, 128)
-OBMCSADMxN(128, 64)
-OBMCSADMxN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-OBMCSADMxN(64, 64)
-OBMCSADMxN(64, 32)
-OBMCSADMxN(32, 64)
-OBMCSADMxN(32, 32)
-OBMCSADMxN(32, 16)
-OBMCSADMxN(16, 32)
-OBMCSADMxN(16, 16)
-OBMCSADMxN(16, 8)
-OBMCSADMxN(8, 16)
-OBMCSADMxN(8, 8)
-OBMCSADMxN(8, 4)
-OBMCSADMxN(4, 8)
-OBMCSADMxN(4, 4)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
-OBMCSADMxN(4, 16)
-OBMCSADMxN(16, 4)
-OBMCSADMxN(8, 32)
-OBMCSADMxN(32, 8)
-OBMCSADMxN(16, 64)
-OBMCSADMxN(64, 16)
-OBMCSADMxN(32, 128)
-OBMCSADMxN(128, 32)
-#endif
-/* clang-format on */
-
-#if CONFIG_HIGHBITDEPTH
-                                static INLINE
-    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
-                                 const int32_t *wsrc, const int32_t *mask,
-                                 int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define HIGHBD_OBMCSADMXN(m, n)                                \
-  unsigned int aom_highbd_obmc_sad##m##x##n##_c(               \
-      const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
-      const int32_t *mask) {                                   \
-    return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
-  }
-
-/* clang-format off */
-#if CONFIG_EXT_PARTITION
-HIGHBD_OBMCSADMXN(128, 128)
-HIGHBD_OBMCSADMXN(128, 64)
-HIGHBD_OBMCSADMXN(64, 128)
-#endif  // CONFIG_EXT_PARTITION
-HIGHBD_OBMCSADMXN(64, 64)
-HIGHBD_OBMCSADMXN(64, 32)
-HIGHBD_OBMCSADMXN(32, 64)
-HIGHBD_OBMCSADMXN(32, 32)
-HIGHBD_OBMCSADMXN(32, 16)
-HIGHBD_OBMCSADMXN(16, 32)
-HIGHBD_OBMCSADMXN(16, 16)
-HIGHBD_OBMCSADMXN(16, 8)
-HIGHBD_OBMCSADMXN(8, 16)
-HIGHBD_OBMCSADMXN(8, 8)
-HIGHBD_OBMCSADMXN(8, 4)
-HIGHBD_OBMCSADMXN(4, 8)
-HIGHBD_OBMCSADMXN(4, 4)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
-HIGHBD_OBMCSADMXN(4, 16)
-HIGHBD_OBMCSADMXN(16, 4)
-HIGHBD_OBMCSADMXN(8, 32)
-HIGHBD_OBMCSADMXN(32, 8)
-HIGHBD_OBMCSADMXN(16, 64)
-HIGHBD_OBMCSADMXN(64, 16)
-HIGHBD_OBMCSADMXN(32, 128)
-HIGHBD_OBMCSADMXN(128, 32)
-#endif
-/* clang-format on */
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
+    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
new file mode 100644
index 000000000..c176001d6
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad_av1.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+                                      const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride, int width,
+                                      int height) {
+  int y, x;
+  unsigned int sad = 0;
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sad = (sad + 31) >> 6;
+  return sad;
+}
+
+#define MASKSADMxN(m, n)                                                       \
+  unsigned int aom_masked_sad##m##x##n##_c(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
+      int invert_mask) {                                                       \
+    if (!invert_mask)                                                          \
+      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+                        msk_stride, m, n);                                     \
+    else                                                                       \
+      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+                        msk_stride, m, n);                                     \
+  }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+
+    /* clang-format on */
+
+    static INLINE
+    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
+                                   const uint8_t *a8, int a_stride,
+                                   const uint8_t *b8, int b_stride,
+                                   const uint8_t *m, int m_stride, int width,
+                                   int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sad = (sad + 31) >> 6;
+
+  return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n)                                         \
+  unsigned int aom_highbd_masked_sad##m##x##n##_c(                      \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,         \
+      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,  \
+      int msk_stride, int invert_mask) {                                \
+    if (!invert_mask)                                                   \
+      return highbd_masked_sad(src8, src_stride, ref8, ref_stride,      \
+                               second_pred8, m, msk, msk_stride, m, n); \
+    else                                                                \
+      return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
+                               ref_stride, msk, msk_stride, m, n);      \
+  }
+
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+HIGHBD_MASKSADMXN(4, 16)
+HIGHBD_MASKSADMXN(16, 4)
+HIGHBD_MASKSADMXN(8, 32)
+HIGHBD_MASKSADMXN(32, 8)
+HIGHBD_MASKSADMXN(16, 64)
+HIGHBD_MASKSADMXN(64, 16)
+
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (has been *4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+
+  return sad;
+}
+
+#define OBMCSADMxN(m, n)                                                     \
+  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
+                                         const int32_t *wsrc,                \
+                                         const int32_t *mask) {              \
+    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
+  }
+
+/* clang-format off */
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+OBMCSADMxN(4, 16)
+OBMCSADMxN(16, 4)
+OBMCSADMxN(8, 32)
+OBMCSADMxN(32, 8)
+OBMCSADMxN(16, 64)
+OBMCSADMxN(64, 16)
+    /* clang-format on */
+
+    static INLINE
+    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+
+  return sad;
+}
+
+#define HIGHBD_OBMCSADMXN(m, n)                                \
+  unsigned int aom_highbd_obmc_sad##m##x##n##_c(               \
+      const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+      const int32_t *mask) {                                   \
+    return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+  }
+
+/* clang-format off */
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+HIGHBD_OBMCSADMXN(4, 16)
+HIGHBD_OBMCSADMXN(16, 4)
+HIGHBD_OBMCSADMXN(8, 32)
+HIGHBD_OBMCSADMXN(32, 8)
+HIGHBD_OBMCSADMXN(16, 64)
+HIGHBD_OBMCSADMXN(64, 16)
+/* clang-format on */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
index 8f6509383..51a38a7e1 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -15,8 +15,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./v128_intrinsics_c.h"
-#include "./v64_intrinsics.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
 
 /* Fallback to plain, unoptimised C. */
 
@@ -57,6 +58,7 @@ SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
 
 typedef uint32_t sad128_internal;
 SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
@@ -74,9 +76,15 @@ SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
 SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
   return c_v128_ssd_u8_sum(s);
 }
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  return c_v128_dotp_su8(a, b);
+}
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   return c_v128_dotp_s16(a, b);
 }
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  return c_v128_dotp_s32(a, b);
+}
 SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
 
 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
@@ -86,8 +94,12 @@ SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
 
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
+SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
 SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
 SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
 SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
@@ -96,6 +108,7 @@ SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
 SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
 SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
 SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
 SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
 
@@ -112,8 +125,16 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+  return c_v128_blend_8(a, b, c);
+}
+
 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  return c_v128_rdavg_u16(a, b);
+}
 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
 SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
@@ -121,6 +142,8 @@ SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
 SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
 
 SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
 SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
@@ -168,6 +191,9 @@ SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
 SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
   return c_v128_pack_s32_s16(a, b);
 }
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+  return c_v128_pack_s32_u16(a, b);
+}
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return c_v128_pack_s16_u8(a, b);
 }
@@ -203,6 +229,14 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
 }
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
 
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  return c_v128_cmpgt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  return c_v128_cmplt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
+
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
   return c_v128_shl_8(a, c);
 }
@@ -230,6 +264,15 @@ SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
 SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
   return c_v128_shr_s32(a, c);
 }
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return c_v128_shl_64(a, c);
+}
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return c_v128_shr_u64(a, c);
+}
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  return c_v128_shr_s64(a, c);
+}
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
   return c_v128_shr_n_byte(a, n);
@@ -246,6 +289,9 @@ SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
   return c_v128_shl_n_32(a, n);
 }
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
+  return c_v128_shl_n_64(a, n);
+}
 SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
   return c_v128_shr_n_u8(a, n);
 }
@@ -255,6 +301,9 @@ SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
   return c_v128_shr_n_u32(a, n);
 }
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
+  return c_v128_shr_n_u64(a, n);
+}
 SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
   return c_v128_shr_n_s8(a, n);
 }
@@ -264,5 +313,32 @@ SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
   return c_v128_shr_n_s32(a, n);
 }
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
+  return c_v128_shr_n_s64(a, n);
+}
+
+typedef uint32_t sad128_internal_u16;
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+  return c_v128_sad_u16_init();
+}
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+  return c_v128_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  return c_v128_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+  return c_v128_ssd_s16_init();
+}
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  return c_v128_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return c_v128_ssd_s16_sum(s);
+}
 
 #endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
index 0377d4ce1..d4fec4237 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
@@ -13,7 +13,8 @@
 #define _V128_INTRINSICS_H
 
 #include <arm_neon.h>
-#include "./v64_intrinsics_arm.h"
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
 
 typedef int64x2_t v128;
 
@@ -28,7 +29,7 @@ SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
 
 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
+  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
 }
 
 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
@@ -83,22 +84,57 @@ SIMD_INLINE v128 v128_dup_32(uint32_t x) {
   return vreinterpretq_s64_u32(vdupq_n_u32(x));
 }
 
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  return vreinterpretq_s64_u64(vdupq_n_u64(x));
+}
+
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  int16x8_t t1 = vmulq_s16(
+      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
+  int16x8_t t2 = vmulq_s16(
+      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
+#else
+  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
+  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+#endif
+}
+
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
          v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
 }
 
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  int64x2_t t = vpaddlq_s32(
+      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+}
+
 SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
+#if defined(__aarch64__)
+  return vaddlvq_u8(vreinterpretq_u8_s64(x));
+#else
   uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
   return vget_lane_s32(
       vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v128 v128_padd_s16(v128 a) {
   return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
 }
 
-typedef struct { sad64_internal hi, lo; } sad128_internal;
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
+}
+
+typedef struct {
+  sad64_internal hi, lo;
+} sad128_internal;
 
 SIMD_INLINE sad128_internal v128_sad_u8_init() {
   sad128_internal s;
@@ -117,14 +153,21 @@ SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
 }
 
 SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-  return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo));
+#if defined(__aarch64__)
+  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
+#else
+  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
+  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
+#endif
 }
 
-typedef struct { ssd64_internal hi, lo; } ssd128_internal;
+typedef struct {
+  ssd64_internal hi, lo;
+} ssd128_internal;
 
 SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
   ssd128_internal s;
-  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
+  s.hi = s.lo = v64_ssd_u8_init();
   return s;
 }
 
@@ -154,6 +197,16 @@ SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
       vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
 }
 
+SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
+  return vreinterpretq_s64_u8(
+      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
+  return vreinterpretq_s64_s8(
+      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
 SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
   return vreinterpretq_s64_s16(
       vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
@@ -169,6 +222,11 @@ SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
       vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
 }
 
+SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
+  return vreinterpretq_s64_u64(
+      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
+}
+
 SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
   return vreinterpretq_s64_u8(
       vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
@@ -204,6 +262,8 @@ SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
       vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
 }
 
+SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
+
 SIMD_INLINE v128 v128_abs_s16(v128 x) {
   return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
 }
@@ -223,8 +283,16 @@ SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
 }
 
 SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_s16(vuzp2q_s16(
+      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+                                      vreinterpret_s16_s64(vget_low_s64(b)))),
+      vreinterpretq_s16_s32(
+          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
+#else
   return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
                        v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
@@ -233,13 +301,32 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
 }
 
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+                           vreinterpret_s16_s64(vget_low_s64(b)));
+  int32x4_t t2 =
+      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
+  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
+#else
   return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
                        v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__aarch64__)
+  int16x8_t t1 = vmulq_s16(
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
+      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
+  int16x8_t t2 = vmulq_s16(
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
+      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
+  return vreinterpretq_s64_s16(
+      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
+#else
   return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
                        v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
@@ -252,6 +339,11 @@ SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
       vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
 }
 
+SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
+  return vreinterpretq_s64_u16(
+      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
 SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
   return vreinterpretq_s64_u16(
       vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
@@ -272,6 +364,26 @@ SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
       vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
 }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
+  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
+#if defined(__aarch64__)
+  uint8x16_t m =
+      vandq_u8(vreinterpretq_u8_s64(a),
+               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
+  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
+#else
+  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
+      vandq_u8(vreinterpretq_u8_s64(a),
+               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
+  return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+#endif
+}
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
+  return v128_or(v128_and(b, c), v128_andn(a, c));
+}
+
 SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
   return vreinterpretq_s64_s8(
       vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
@@ -287,14 +399,34 @@ SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
       vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }
 
+SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_s32(
+      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_s32(
+      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
 SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
@@ -303,13 +435,23 @@ SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
 }
 
 SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
   return vreinterpretq_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
   return vreinterpretq_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
@@ -318,13 +460,23 @@ SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
   return vreinterpretq_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
   return vreinterpretq_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
@@ -333,47 +485,76 @@ SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
 }
 
 SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
-  return v128_from_v64(vget_low_u64((uint64x2_t)a),
-                       vget_low_u64((uint64x2_t)b));
+  return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
 }
 
 SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
-  return v128_from_v64(vget_high_u64((uint64x2_t)a),
-                       vget_high_u64((uint64x2_t)b));
+  return v128_from_v64(vget_high_s64((int64x2_t)a),
+                       vget_high_s64((int64x2_t)b));
 }
 
 SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   uint16x8x2_t r =
       vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
   return vreinterpretq_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   uint16x8x2_t r =
       vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
   return vreinterpretq_s64_u16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   uint32x4x2_t r =
       vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
   return vreinterpretq_s64_u32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   uint32x4x2_t r =
       vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
   return vreinterpretq_s64_u32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
@@ -406,6 +587,12 @@ SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
       vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
 }
 
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+  return v128_from_v64(
+      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
+      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
+}
+
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return v128_from_v64(
       vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
@@ -447,15 +634,17 @@ SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
 }
 
 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-  return v128_from_64(
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
-                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
-                   vreinterpret_u8_s64(vget_high_s64(pattern)))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
-                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
-                   vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
+#else
+  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
+                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
+  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
+                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#endif
 }
 
 SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
@@ -488,19 +677,37 @@ SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
       vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }
 
+SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
-                                     vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
+  return (c > 7) ? v128_zero()
+                 : vreinterpretq_s64_u8(
+                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
 }
 
 SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
-                                     vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
+  return (c > 7) ? v128_zero()
+                 : vreinterpretq_s64_u8(
+                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
 }
 
 SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8(
-                                     vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
+  return (c > 7) ? v128_ones()
+                 : vreinterpretq_s64_s8(
+                       vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
 }
 
 SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
@@ -539,6 +746,22 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
                         vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
 }
 
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_zero()
+                  : vreinterpretq_s64_u64(
+                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  const int64x2_t n = vdupq_n_s64(-(int64_t)c); /* signed, so -c cannot wrap */
+  const uint64x2_t r = vshlq_u64(vreinterpretq_u64_s64(a), n);
+  return (c > 63) ? v128_zero() : vreinterpretq_s64_u64(r);
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-(int64_t)c));
+} /* the count is negated as int64_t: negating the unsigned c would wrap */
+
 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
@@ -562,16 +785,18 @@ SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
   return n < 8
              ? v128_from_64(
-                   vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
-                   vorr_u64(
+                   (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                        n * 8),
+                   (uint64_t)vorr_u64(
                        vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
                        vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                   (8 - n) * 8)))
-             : (n == 8
-                    ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a)))
-                    : v128_from_64(
-                          0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                        (n - 8) * 8)));
+             : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
+                                             vget_high_s64(a)))
+                       : v128_from_64(
+                             0, (uint64_t)vshr_n_u64(
+                                    vreinterpret_u64_s64(vget_high_s64(a)),
+                                    (n - 8) * 8)));
 }
 
 SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
@@ -610,6 +835,18 @@ SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
   return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
 }
 
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+  return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+  return vshrq_n_s64(a, c);
+}
+
 #else
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
@@ -666,6 +903,55 @@ SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
   return v128_shr_s32(a, c);
 }
 
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+  return v128_shl_64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+  return v128_shr_u64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+  return v128_shr_s64(a, c);
+}
+
 #endif
 
+typedef uint32x4_t sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+
+/* Adds the absolute differences of the unsigned 16-bit lanes of a and b to s.
+ * s is implementation dependent: finalise it with v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+  return vaddq_u32(
+      s, vpaddlq_u16(vsubq_u16(
+             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
+             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  uint64x2_t t = vpaddlq_u32(s);
+  return (uint32_t)(uint64_t)vget_high_u64(t) +
+         (uint32_t)(uint64_t)vget_low_u64(t);
+}
+
+typedef v128 ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Adds the squared differences of the 16-bit lanes of a and b to s.  The value
+ * of s is implementation dependent: finalise it with v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(
+      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
 #endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
index 32e7c32de..e508f6ad7 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./v64_intrinsics_c.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
 
 typedef union {
   uint8_t u8[16];
@@ -115,11 +117,30 @@ SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
   return t;
 }
 
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+  c_v128 t;
+  t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
+  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
+         c_v64_dotp_su8(a.v64[0], b.v64[0]);
+}
+
 SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
   return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
          c_v64_dotp_s16(a.v64[0], b.v64[0]);
 }
 
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+  int64_t sum = 0;  // 64 bit total of products truncated to 32 bits
+  int i;
+  for (i = 0; i < 4; i++)
+    sum += (int64_t)(int32_t)((int64_t)a.s32[i] * b.s32[i]);
+  return sum;
+}
+
 SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
   return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
 }
@@ -186,6 +207,16 @@ SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
                          c_v64_add_16(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
+                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
+                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                          c_v64_sadd_s16(a.v64[0], b.v64[0]));
@@ -196,6 +227,15 @@ SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
                          c_v64_add_32(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+  // Explicit two's complement wrap-around on overflow (silences sanitizers)
+  return c_v128_from_64(
+      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+                                   : a.v64[1].u64 + b.v64[1].u64,
+      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+                                   : a.v64[0].u64 + b.v64[0].u64);
+}
+
 SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
   c_v128 t;
   t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
@@ -205,6 +245,19 @@ SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
   return t;
 }
 
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+  c_v128 t;  // t.u16[i] = a.u8[2 * i] + a.u8[2 * i + 1]
+  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                          c_v64_sub_8(a.v64[0], b.v64[0]));
@@ -240,6 +293,15 @@ SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
                          c_v64_sub_32(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+  // Explicit two's complement wrap-around on underflow (silences sanitizers)
+  return c_v128_from_64(
+      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+                                  : a.v64[1].u64 - b.v64[1].u64,
+      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
+                                  : a.v64[0].u64 - b.v64[0].u64);
+}
+
 SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
   return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
 }
@@ -290,6 +352,11 @@ SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
                          c_v64_rdavg_u8(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
+                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                          c_v64_avg_u16(a.v64[0], b.v64[0]));
@@ -310,6 +377,22 @@ SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
                          c_v64_min_s8(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
+  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+         ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
+  c_v128 t;
+  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                          c_v64_max_s8(a.v64[0], b.v64[0]));
@@ -325,6 +408,20 @@ SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
                          c_v64_max_s16(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                          c_v64_ziplo_8(a.v64[0], b.v64[0]));
@@ -518,6 +615,11 @@ SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
                          c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
+                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                          c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
@@ -559,15 +661,10 @@ SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
 SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
   c_v128 t;
   int c;
-  for (c = 0; c < 16; c++) {
-    if (pattern.u8[c] & ~15) {
-      fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c],
-              c);
-      abort();
-    }
+  for (c = 0; c < 16; c++)
     t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                      : pattern.u8[c] & 15];
-  }
+
   return t;
 }
 
@@ -601,7 +698,28 @@ SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
                          c_v64_cmpeq_16(a.v64[0], b.v64[0]));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
+  c_v128 t;  // each 32-bit lane: -1 (all ones) if a > b, else 0
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
+  c_v128 t;  // each 32-bit lane: -1 (all ones) if a < b, else 0
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
+  c_v128 t;  // each 32-bit lane: -1 (all ones) if a == b, else 0
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
   if (n < 8)
     return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                     c_v64_shr_n_byte(a.v64[0], 8 - n)),
@@ -610,7 +728,7 @@ SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) {
     return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
   if (n < 8)
     return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                            c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
@@ -619,7 +737,7 @@ SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) {
     return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
 }
 
-SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
   if (SIMD_CHECK && c > 15) {
     fprintf(stderr, "Error: undefined alignment %d\n", c);
     abort();
@@ -628,80 +746,143 @@ SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) {
            : b;
 }
 
-SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                          c_v64_shr_u16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                          c_v64_shr_s16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                          c_v64_shr_u32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                          c_v64_shr_s32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
+  a.v64[1].u64 <<= c;
+  a.v64[0].u64 <<= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
+  a.v64[1].u64 >>= c;
+  a.v64[0].u64 >>= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
+  a.v64[1].s64 >>= c;
+  a.v64[0].s64 >>= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
   return c_v128_shl_8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
   return c_v128_shl_16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
   return c_v128_shl_32(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
+  return c_v128_shl_64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
   return c_v128_shr_u8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
   return c_v128_shr_u16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
   return c_v128_shr_u32(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
+  return c_v128_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
   return c_v128_shr_s8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
   return c_v128_shr_s16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
   return c_v128_shr_s32(a, n);
 }
 
+SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
+  return c_v128_shr_s64(a, n);
+}
+
+typedef uint32_t c_sad128_internal_u16;
+
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+                                                 c_v128 a, c_v128 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+                                                 c_v128 a, c_v128 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
 #endif /* _V128_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
index cca1788d5..f9043fe99 100644
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -12,7 +12,8 @@
 #ifndef _V128_INTRINSICS_H
 #define _V128_INTRINSICS_H
 
-#include "./v64_intrinsics_x86.h"
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
 
 typedef __m128i v128;
 
@@ -62,7 +63,7 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
 // Some compilers will check this during optimisation, others wont.
 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 #if defined(__SSSE3__)
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
   return c ? _mm_alignr_epi8(a, b, c) : b;
 }
 #else
@@ -71,7 +72,7 @@ SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
 #endif
 #else
 #if defined(__SSSE3__)
-#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
 #else
 #define v128_align(a, b, c) \
   ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
@@ -86,14 +87,25 @@ SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
 
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
 
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
+  return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
+}
+
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
 
 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
 
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
+
 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
 
 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
 
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
+
 SIMD_INLINE v128 v128_padd_s16(v128 a) {
   return _mm_madd_epi16(a, _mm_set1_epi16(1));
 }
@@ -112,6 +124,8 @@ SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
 
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
 
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
+
 SIMD_INLINE v128 v128_abs_s16(v128 a) {
 #if defined(__SSSE3__)
   return _mm_abs_epi16(a);
@@ -241,6 +255,15 @@ SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
   return _mm_packs_epi32(b, a);
 }
 
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_packus_epi32(b, a);
+#else
+  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
+                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
+#endif
+}
+
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return _mm_packus_epi16(b, a);
 }
@@ -291,6 +314,15 @@ SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
 #endif
 }
 
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
+  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
+  v128 t = v128_add_32(t1, t2);
+  t = v128_add_32(t, _mm_srli_si128(t, 8));
+  t = v128_add_32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v128_low_u32(t);
+}
+
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   v128 r = _mm_madd_epi16(a, b);
 #if defined(__SSE4_1__) && defined(__x86_64__)
@@ -325,31 +357,25 @@ SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
   return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
 }
 
-typedef v128 ssd128_internal;
+typedef int32_t ssd128_internal;
 
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_sum(). */
 SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
-                         _mm_unpacklo_epi8(b, _mm_setzero_si128()));
-  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
-                         _mm_unpackhi_epi8(b, _mm_setzero_si128()));
+  v128 z = _mm_setzero_si128();
+  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
   v128 rl = _mm_madd_epi16(l, l);
   v128 rh = _mm_madd_epi16(h, h);
-  v128 c = _mm_cvtsi32_si128(32);
-  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
-  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
-  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
-  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
-  return _mm_add_epi64(
-      s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
+  v128 r = _mm_add_epi32(rl, rh);
+  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+  return s + _mm_cvtsi128_si32(r);
 }
 
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
-  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
-}
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
 
 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
 
@@ -385,6 +411,14 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
 #endif
 }
 
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  v128 r = v128_mullo_s32(a, b);
+  return (int64_t)_mm_cvtsi128_si32(r) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
 
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
@@ -399,6 +433,10 @@ SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
 #endif
 }
 
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  return v128_madd_us8(a, _mm_set1_epi8(1));
+}
+
 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
 
 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
@@ -406,6 +444,11 @@ SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
 }
 
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  return _mm_sub_epi16(_mm_avg_epu16(a, b),
+                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
+}
+
 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
 
 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
@@ -421,6 +464,17 @@ SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
 #endif
 }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+#if defined(__SSE4_1__)
+  return _mm_blendv_epi8(a, b, c);
+#else
+  c = _mm_cmplt_epi8(c, v128_zero());
+  return v128_or(v128_and(b, c), v128_andn(a, c));
+#endif
+}
+
 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
 #if defined(__SSE4_1__)
   return _mm_max_epi8(a, b);
@@ -434,6 +488,24 @@ SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
 
 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
 
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_min_epi32(a, b);
+#else
+  v128 mask = _mm_cmplt_epi32(a, b);
+  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_max_epi32(a, b);
+#else
+  v128 mask = _mm_cmplt_epi32(b, a);
+  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
 SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
 
 SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
@@ -448,6 +520,16 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
   return _mm_cmplt_epi16(a, b);
 }
 
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  return _mm_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  return _mm_cmplt_epi32(a, b);
+}
+
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
@@ -490,10 +572,25 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
   return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
 }
 
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  // _mm_sra_epi64 is AVX-512 only (F+VL); shift each 64-bit lane as a scalar
+  return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
+                      (int64_t)v64_u64(v128_low_v64(a)) >> c);
+  // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+}
+
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
-#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
-#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
 #define v128_shl_n_8(a, c) \
   _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
 #define v128_shr_n_u8(a, c) \
@@ -507,5 +604,53 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
 #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
 #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
 #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
+#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
+#define v128_shr_n_s64(a, c) \
+  v128_shr_s64(a, c)  // _mm_srai_epi64 is AVX-512 only (F+VL)
+
+typedef v128 sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+#if defined(__SSE4_1__)  // |a - b| per u16 lane as max - min
+  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
+#else  // unsigned compare emulated: flip sign bits, then compare as signed
+  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
+                          v128_xor(b, v128_dup_16(32768)));
+  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
+                  v128_or(v128_and(a, t), v128_andn(b, t)));
+#endif
+  return v128_add_32(
+      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
+         v128_low_u32(v128_shr_n_byte(s, 8)) +
+         v128_low_u32(v128_shr_n_byte(s, 12));
+}
+
+typedef v128 ssd128_internal_s16;
+
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);  // _mm_madd_epi16: 32-bit sums of adjacent squares
+  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+                                    _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
 
 #endif /* _V128_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
index 1896374ee..0e5ae5b68 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -15,9 +15,10 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./v256_intrinsics_c.h"
-#include "./v128_intrinsics.h"
-#include "./v64_intrinsics.h"
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
 
 /* Fallback to plain, unoptimised C. */
 
@@ -25,6 +26,7 @@ typedef c_v256 v256;
 
 SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
 SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
 SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
 SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
 SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
@@ -59,6 +61,7 @@ SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
 SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
 
 typedef uint32_t sad256_internal;
 SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
@@ -76,9 +79,16 @@ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
 SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
   return c_v256_ssd_u8_sum(s);
 }
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  return c_v256_dotp_su8(a, b);
+}
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
   return c_v256_dotp_s16(a, b);
 }
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return c_v256_dotp_s32(a, b);
+}
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
 
 SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
@@ -88,8 +98,13 @@ SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
+SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
 SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
 SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
 SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
@@ -114,8 +129,16 @@ SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
 SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
 SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
 
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return c_v256_blend_8(a, b, c);
+}
+
 SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
 SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return c_v256_rdavg_u16(a, b);
+}
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
 SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
@@ -123,6 +146,8 @@ SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
 
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
@@ -159,6 +184,12 @@ SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
   return c_v256_unziphi_32(a, b);
 }
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  return c_v256_unziplo_64(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  return c_v256_unziphi_64(a, b);
+}
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
   return c_v256_unpacklo_u8_s16(a);
@@ -176,6 +207,9 @@ SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
   return c_v256_pack_s32_s16(a, b);
 }
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return c_v256_pack_s32_u16(a, b);
+}
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
   return c_v256_pack_s16_u8(a, b);
 }
@@ -203,6 +237,9 @@ SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
 SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
   return c_v256_shuffle_8(a, pattern);
 }
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  return c_v256_wideshuffle_8(a, b, pattern);
+}
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
   return c_v256_pshuffle_8(a, pattern);
 }
@@ -217,7 +254,14 @@ SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
   return c_v256_cmplt_s16(a, b);
 }
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
 
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return c_v256_cmpgt_s32(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return c_v256_cmplt_s32(a, b);
+}
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return c_v256_shl_8(a, c);
 }
@@ -261,6 +305,9 @@ SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
 SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
   return c_v256_shl_n_32(a, n);
 }
+SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
+  return c_v256_shl_n_64(a, n);
+}
 SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
   return c_v256_shr_n_u8(a, n);
 }
@@ -270,6 +317,9 @@ SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
 SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
   return c_v256_shr_n_u32(a, n);
 }
+SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
+  return c_v256_shr_n_u64(a, n);
+}
 SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
   return c_v256_shr_n_s8(a, n);
 }
@@ -279,5 +329,39 @@ SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
 SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
   return c_v256_shr_n_s32(a, n);
 }
+SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
+  return c_v256_shr_n_s64(a, n);
+}
+
+SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
+  return c_v256_shr_n_word(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
+  return c_v256_shl_n_word(a, n);
+}
+
+typedef uint32_t sad256_internal_u16;
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+  return c_v256_sad_u16_init();
+}
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  return c_v256_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  return c_v256_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd256_internal_s16;
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+  return c_v256_ssd_s16_init();
+}
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return c_v256_ssd_s16_sum(s);
+}
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
index ba4ed719d..d96638488 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
@@ -12,6 +12,6 @@
 #ifndef _V256_INTRINSICS_H
 #define _V256_INTRINSICS_H
 
-#include "./v256_intrinsics_v128.h"
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
index f96ca7fa6..5b412df71 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -14,8 +14,10 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./v128_intrinsics_c.h"
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
 
 typedef union {
   uint8_t u8[32];
@@ -34,6 +36,8 @@ SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
 
 SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
 
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
 SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
 
 SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
@@ -120,23 +124,39 @@ SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
   return t;
 }
 
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+         c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
 SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
   return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
          c_v128_dotp_s16(a.v128[0], b.v128[0]);
 }
 
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
 SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
   return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
 }
 
 typedef uint32_t c_sad256_internal;
 
-SIMD_INLINE c_sad128_internal c_v256_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
    v256_sad_u8_sum().
    The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad128_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
                                             c_v256 b) {
   int c;
   for (c = 0; c < 32; c++)
@@ -191,6 +211,16 @@ SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
                           c_v128_add_16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
+                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
+                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                           c_v128_sadd_s16(a.v128[0], b.v128[0]));
@@ -201,6 +231,23 @@ SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
                           c_v128_add_32(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
+                          c_v128_add_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
+                          c_v128_sub_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
+  c_v256 t;
+  for (int i = 0; i < 16; i++)
+    t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
   c_v256 t;
   t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
@@ -299,6 +346,11 @@ SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
                           c_v128_rdavg_u8(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
+                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                           c_v128_avg_u16(a.v128[0], b.v128[0]));
@@ -319,6 +371,30 @@ SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
                           c_v128_min_s8(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { /* bit i = (a.s8[i] < 0) */
+  return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
+         ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
+         ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
+         ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
+         ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
+         ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
+         ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
+         ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
+         ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+         ((a.s8[0] < 0) << 0);
+} /* bit 31 is shifted as uint32_t: 1 << 31 would overflow a 32-bit int */
+
+SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
+  c_v256 t;
+  for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                           c_v128_max_s8(a.v128[0], b.v128[0]));
@@ -334,6 +410,16 @@ SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
                           c_v128_max_s16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
+                          c_v128_min_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
+                          c_v128_max_s32(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                           c_v128_ziplo_8(a.v128[0], b.v128[0]));
@@ -482,6 +568,32 @@ SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
                            : _c_v256_unzip_32(b, a, 1);
 }
 
+SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
+  c_v256 t;
+  if (mode) {
+    t.u64[3] = b.u64[3];
+    t.u64[2] = b.u64[1];
+    t.u64[1] = a.u64[3];
+    t.u64[0] = a.u64[1];
+  } else {
+    t.u64[3] = a.u64[2];
+    t.u64[2] = a.u64[0];
+    t.u64[1] = b.u64[2];
+    t.u64[0] = b.u64[0];
+  }
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
+                           : _c_v256_unzip_64(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
+                           : _c_v256_unzip_64(b, a, 1);
+}
+
 SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
   return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
 }
@@ -515,6 +627,11 @@ SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
                           c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                           c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
@@ -558,15 +675,21 @@ SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
 SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
   c_v256 t;
   int c;
-  for (c = 0; c < 32; c++) {
-    if (pattern.u8[c] & ~31) {
-      fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
-              c);
-      abort();
-    }
+  for (c = 0; c < 32; c++)
     t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                      : pattern.u8[c] & 31];
-  }
+
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = (pattern.u8[c] < 32
+                   ? b.u8
+                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                             : pattern.u8[c] & 31];
   return t;
 }
 
@@ -607,6 +730,21 @@ SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
                           c_v128_cmpeq_16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
+                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
+                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
+                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
   if (n < 16)
     return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
@@ -685,6 +823,45 @@ SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
                           c_v128_shr_s32(a.v128[0], c));
 }
 
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+  c_v256 t; /* receives the four s64 lanes of a, each shifted right by n */
+  if (SIMD_CHECK && n > 63) { /* with SIMD_CHECK set, counts above 63 abort */
+    fprintf(stderr, "Error: undefined s64 shift right %u\n", n);
+    abort();
+  }
+  t.s64[3] = a.s64[3] >> n;
+  t.s64[2] = a.s64[2] >> n;
+  t.s64[1] = a.s64[1] >> n;
+  t.s64[0] = a.s64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+  c_v256 t; /* receives the four u64 lanes of a, each shifted right by n */
+  if (SIMD_CHECK && n > 63) { /* with SIMD_CHECK set, counts above 63 abort */
+    fprintf(stderr, "Error: undefined u64 shift right %u\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] >> n;
+  t.u64[2] = a.u64[2] >> n;
+  t.u64[1] = a.u64[1] >> n;
+  t.u64[0] = a.u64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+  c_v256 t; /* receives the four u64 lanes of a, each shifted left by n */
+  if (SIMD_CHECK && n > 63) { /* with SIMD_CHECK set, counts above 63 abort */
+    fprintf(stderr, "Error: undefined u64 shift left %u\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] << n;
+  t.u64[2] = a.u64[2] << n;
+  t.u64[1] = a.u64[1] << n;
+  t.u64[0] = a.u64[0] << n;
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
   return c_v256_shl_8(a, n);
 }
@@ -697,6 +874,10 @@ SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
   return c_v256_shl_32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+  return c_v256_shl_64(a, n);
+}
+
 SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
   return c_v256_shr_u8(a, n);
 }
@@ -709,6 +890,10 @@ SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
   return c_v256_shr_u32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+  return c_v256_shr_u64(a, n);
+}
+
 SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
   return c_v256_shr_s8(a, n);
 }
@@ -721,4 +906,48 @@ SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
   return c_v256_shr_s32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
+  return c_v256_shr_s64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
+  return c_v256_shr_n_byte(a, 2 * n);
+}
+SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
+  return c_v256_shl_n_byte(a, 2 * n);
+}
+
+typedef uint32_t c_sad256_internal_u16;
+
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u16_sum(). */
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
+                                                 c_v256 a, c_v256 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd256_internal_s16;
+
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
+                                                 c_v256 a, c_v256 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
+
 #endif /* _V256_INTRINSICS_C_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
index cbea55ca1..60b2a1791 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -13,27 +13,35 @@
 #define _V256_INTRINSICS_V128_H
 
 #if HAVE_NEON
-#include "./v128_intrinsics_arm.h"
+#include "aom_dsp/simd/v128_intrinsics_arm.h"
 #elif HAVE_SSE2
-#include "./v128_intrinsics_x86.h"
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
 #else
-#include "./v128_intrinsics.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
 #endif
 
-typedef struct { v128 lo, hi; } v256;
+#if HAVE_NEON
+typedef int64x2x2_t v256;
+#else
+typedef struct {
+  v128 val[2];
+} v256;
+#endif
 
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
 
-SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
 
-SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
 
-SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
 
 SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
   v256 t;
-  t.hi = hi;
-  t.lo = lo;
+  t.val[1] = hi;
+  t.val[0] = lo;
   return t;
 }
 
@@ -56,13 +64,13 @@ SIMD_INLINE v256 v256_load_aligned(const void *p) {
 }
 
 SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
-  v128_store_unaligned(p, a.lo);
-  v128_store_unaligned((uint8_t *)p + 16, a.hi);
+  v128_store_unaligned(p, a.val[0]);
+  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
 }
 
 SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
-  v128_store_aligned(p, a.lo);
-  v128_store_aligned((uint8_t *)p + 16, a.hi);
+  v128_store_aligned(p, a.val[0]);
+  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
 }
 
 SIMD_INLINE v256 v256_zero() {
@@ -84,23 +92,35 @@ SIMD_INLINE v256 v256_dup_32(uint32_t x) {
   return v256_from_v128(t, t);
 }
 
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+  v128 t = v128_dup_64(x);
+  return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
+}
+
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
-  return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
+  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
 }
 
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
-  return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
+  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
 }
 
 typedef struct {
-  sad128_internal hi;
-  sad128_internal lo;
+  sad128_internal val[2];
 } sad256_internal;
 
 SIMD_INLINE sad256_internal v256_sad_u8_init() {
   sad256_internal t;
-  t.hi = v128_sad_u8_init();
-  t.lo = v128_sad_u8_init();
+  t.val[1] = v128_sad_u8_init();
+  t.val[0] = v128_sad_u8_init();
   return t;
 }
 
@@ -109,24 +129,23 @@ SIMD_INLINE sad256_internal v256_sad_u8_init() {
    The result for more than 16 v256_sad_u8() calls is undefined. */
 SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
   sad256_internal t;
-  t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
-  t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
+  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
   return t;
 }
 
 SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
-  return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
+  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
 }
 
 typedef struct {
-  ssd128_internal hi;
-  ssd128_internal lo;
+  ssd128_internal val[2];
 } ssd256_internal;
 
 SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
   ssd256_internal t;
-  t.hi = v128_ssd_u8_init();
-  t.lo = v128_ssd_u8_init();
+  t.val[1] = v128_ssd_u8_init();
+  t.val[0] = v128_ssd_u8_init();
   return t;
 }
 
@@ -134,85 +153,124 @@ SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
  * v256_ssd_u8_sum(). */
 SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
   ssd256_internal t;
-  t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
-  t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
+  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
   return t;
 }
 
 SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
-  return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
+  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
 }
 
 SIMD_INLINE v256 v256_or(v256 a, v256 b) {
-  return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
+  return v256_from_v128(v128_or(a.val[1], b.val[1]),
+                        v128_or(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
-  return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
+  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
+                        v128_xor(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_and(v256 a, v256 b) {
-  return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
+  return v256_from_v128(v128_and(a.val[1], b.val[1]),
+                        v128_and(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
-  return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
+  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
+                        v128_andn(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
-  return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
+  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
+                        v128_add_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
-  return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
+  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
+                        v128_add_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
+                        v128_sadd_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
+                        v128_sadd_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
+  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
+                        v128_sadd_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
-  return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
+  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
+                        v128_add_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
+                        v128_add_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_padd_s16(v256 a) {
-  return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
+  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
+  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
+                        v128_sub_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
+                        v128_ssub_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
+                        v128_ssub_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
+  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
+                        v128_sub_16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
+                        v128_ssub_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
+                        v128_ssub_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
+  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
+                        v128_sub_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
+                        v128_sub_64(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_abs_s16(v256 a) {
-  return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
+  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_abs_s8(v256 a) {
-  return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo));
+  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
@@ -223,99 +281,146 @@ SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
 }
 
 SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
+  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
+                        v128_mullo_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
+  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
+                        v128_mulhi_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
+  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
+                        v128_mullo_s32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
+  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
+                        v128_madd_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
+  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
+                        v128_madd_us8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
+  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
+                        v128_avg_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
+  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
+                        v128_rdavg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
+                        v128_rdavg_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
+  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
+                        v128_avg_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
+  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
+                        v128_min_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
+  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
+                        v128_max_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
+  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
+                        v128_min_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+  return (v128_movemask_8(v256_high_v128(a)) << 16) |
+         v128_movemask_8(v256_low_v128(a));
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
+                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
+  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
+                        v128_max_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
+  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
+                        v128_min_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
+  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
+                        v128_max_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
+                        v128_min_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
+                        v128_max_s32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
+                        v128_ziplo_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
+                        v128_ziplo_8(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
+                        v128_ziplo_16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
+                        v128_ziplo_16(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
+                        v128_ziplo_32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
+                        v128_ziplo_32(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
+                        v128_ziplo_64(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
+                        v128_ziplo_64(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(a.lo, b.lo);
+  return v256_from_v128(a.val[0], b.val[0]);
 }
 
 SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(a.hi, b.hi);
+  return v256_from_v128(a.val[1], b.val[1]);
 }
 
 SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
@@ -331,31 +436,59 @@ SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
 }
 
 SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
+                        v128_unziplo_8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
+                        v128_unziphi_8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
-                        v128_unziplo_16(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
+                        v128_unziplo_16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
-                        v128_unziphi_16(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
+                        v128_unziphi_16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
-                        v128_unziplo_32(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
+                        v128_unziplo_32(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
-                        v128_unziphi_32(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
+                        v128_unziphi_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+#if HAVE_SSE2 /* mask 0: take the low 64 bits of both shuffle_pd inputs */
+  return v256_from_v128(
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+                                      _mm_castsi128_pd(a.val[1]), 0)),
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+                                      _mm_castsi128_pd(b.val[1]), 0)));
+#else /* portable path: low v64 of each 128-bit half, a's pair first */
+  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
+                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+#if HAVE_SSE2 /* mask 3: take the high 64 bits of both shuffle_pd inputs */
+  return v256_from_v128(
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+                                      _mm_castsi128_pd(a.val[1]), 3)),
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+                                      _mm_castsi128_pd(b.val[1]), 3)));
+#else /* portable path: high v64 of each 128-bit half, a's pair first */
+  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
+                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
+#endif
 }
 
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
@@ -363,11 +496,13 @@ SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
+  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
+                        v128_unpacklo_u8_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
+  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
+                        v128_unpacklo_u8_s16(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
@@ -375,26 +510,33 @@ SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo));
+  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
+                        v128_unpacklo_s8_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi));
+  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
+                        v128_unpacklo_s8_s16(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
-                        v128_pack_s32_s16(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
+                        v128_pack_s32_s16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
+                        v128_pack_s32_u16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
-                        v128_pack_s16_u8(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
+                        v128_pack_s16_u8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
-                        v128_pack_s16_s8(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
+                        v128_pack_s16_s8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
@@ -406,142 +548,326 @@ SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
-                        v128_unpacklo_u16_s32(a.lo));
+  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
+                        v128_unpacklo_u16_s32(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
-                        v128_unpacklo_s16_s32(a.lo));
+  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
+                        v128_unpacklo_s16_s32(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
-                        v128_unpacklo_u16_s32(a.hi));
+  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
+                        v128_unpacklo_u16_s32(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
-                        v128_unpacklo_s16_s32(a.hi));
-}
-
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  v128 c16 = v128_dup_8(16);
-  v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
-  v128 masklo = v128_cmplt_s8(pattern.lo, c16);
-  return v256_from_v128(
-      v128_or(
-          v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
-          v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
-      v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
-              v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
-                        masklo)));
-}
-
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
-  return v256_from_v128(
-      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
-      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
+                        v128_unpacklo_s16_s32(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
+  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
+                        v128_cmpgt_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
+  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
+                        v128_cmplt_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
+  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
+                        v128_cmpeq_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
+  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
+                        v128_cmpgt_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
+  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
+                        v128_cmplt_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
+  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
+                        v128_cmpeq_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
+                        v128_cmpgt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
+                        v128_cmplt_s32(a.val[0], b.val[0]));
 }
 
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
+                        v128_cmpeq_32(a.val[0], b.val[0]));
 }
 
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
+SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
+#if HAVE_NEON /* NEON: table lookups over both 128-bit halves of x */
+#if defined(__aarch64__) /* one vqtbl2q_u8 lookup per 128-bit pattern half */
+  uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
+                       vreinterpretq_u8_s64(x.val[1]) } };
+  return v256_from_v128(
+      vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+      vreinterpretq_s64_u8(
+          vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else /* non-AArch64 NEON: four vtbl4_u8 lookups, one per 64 pattern bits */
+  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+  return v256_from_64(
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+#endif /* __aarch64__ */
+#else /* generic: look up both halves of x, blend on (pattern < 16) */
+  v128 c16 = v128_dup_8(16);
+  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
+  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
+  return v256_from_v128(
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
+                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
+                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
+#endif /* HAVE_NEON */
 }
 
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
+SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
+#if HAVE_NEON /* NEON: table lookups over the halves of y and x */
+#if defined(__aarch64__) /* 64-byte table: y.val[0..1] then x.val[0..1] */
+  uint8x16x4_t p = { {
+      vreinterpretq_u8_s64(y.val[0]),
+      vreinterpretq_u8_s64(y.val[1]),
+      vreinterpretq_u8_s64(x.val[0]),
+      vreinterpretq_u8_s64(x.val[1]),
+  } };
+  return v256_from_v128(
+      vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+      vreinterpretq_s64_u8(
+          vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else /* non-AArch64 NEON: separate 32-byte tables for x (p) and y (q) */
+  v256 c32 = v256_dup_8(32);
+  v256 p32 = v256_sub_8(pattern, c32); /* pattern - 32, used to index x */
+  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+  uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
+  v256 r1 = /* lookups into x using p32 */
+      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
+  v256 r2 = /* lookups into y using the unmodified pattern */
+      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+  return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); /* mask: < 32 */
+#endif /* __aarch64__ */
+#else /* generic: r1 is looked up in x, r2 in y, then blended on (32 > pattern) */
+  v128 c16 = v128_dup_8(16);
+  v128 c32 = v128_dup_8(32);
+  v128 c48 = v128_dup_8(48);
+  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); /* pattern < 16 */
+  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
+  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); /* pattern < 48 */
+  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
+  v256 r1 = v256_from_v128(
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
+                   maskhi48),
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
+                   masklo48));
+  v256 r2 = v256_from_v128(
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
+                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
+                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
+  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
+#endif /* HAVE_NEON */
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  return v256_from_v128(
+      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
 }
 
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
 }
 
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
-#define v256_shl_n_byte(a, n)                                                 \
-  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n),                \
-                                     v128_shr_n_byte(a.lo, (16 - (n)) & 31)), \
-                             v128_shl_n_byte(a.lo, (n)))                      \
-            : v256_from_v128(                                                 \
-                  (n) > 16 ? v128_shl_n_byte(a.lo, ((n)-16) & 31) : a.lo,     \
+#define v256_shl_n_byte(a, n)                                              \
+  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
+                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
+                             v128_shl_n_byte(a.val[0], (n)))               \
+            : v256_from_v128(                                              \
+                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                   v128_zero()))
 
-#define v256_shr_n_byte(a, n)                                                 \
-  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n),                        \
-                             v128_or(v128_shr_n_byte(a.lo, n),                \
-                                     v128_shl_n_byte(a.hi, (16 - (n)) & 31))) \
-            : v256_from_v128(                                                 \
-                  v128_zero(),                                                \
-                  (n) > 16 ? v128_shr_n_byte(a.hi, ((n)-16) & 31) : a.hi))
+#define v256_shr_n_byte(a, n)                                              \
+  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
+                             v128_or(v128_shr_n_byte(a.val[0], n),         \
+                                     v128_shl_n_byte(a.val[1], 16 - (n)))) \
+            : v256_from_v128(                                              \
+                  v128_zero(),                                             \
+                  (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
 
 #define v256_align(a, b, c) \
   ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
 
 #define v256_shl_n_8(a, n) \
-  v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
+  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
 #define v256_shl_n_16(a, n) \
-  v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
+  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
 #define v256_shl_n_32(a, n) \
-  v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
+  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
+#define v256_shl_n_64(a, n) \
+  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
 #define v256_shr_n_u8(a, n) \
-  v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
+  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
 #define v256_shr_n_u16(a, n) \
-  v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
+  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
 #define v256_shr_n_u32(a, n) \
-  v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
+  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
+#define v256_shr_n_u64(a, n) \
+  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
 #define v256_shr_n_s8(a, n) \
-  v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
+  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
 #define v256_shr_n_s16(a, n) \
-  v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
+  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
 #define v256_shr_n_s32(a, n) \
-  v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
+  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
+#define v256_shr_n_s64(a, n) \
+  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
+
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) /* n words = 2n bytes */
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) /* n words = 2n bytes */
+
+typedef struct {
+  sad128_internal_u16 val[2]; /* one v128 SAD accumulator per v256 half */
+} sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+  sad256_internal_u16 t;
+  t.val[1] = v128_sad_u16_init(); /* fresh accumulator for val[1] */
+  t.val[0] = v128_sad_u16_init(); /* fresh accumulator for val[0] */
+  return t;
+}
+
+/* Feeds each 128-bit half of a and b to v128_sad_u16() with the matching
+   accumulator in s.  Implementation dependent return value; finalise with
+   v256_sad_u16_sum().  Undefined for more than 16 v256_sad_u16() calls. */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  sad256_internal_u16 t;
+  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); /* both halves */
+}
+
+typedef struct {
+  ssd128_internal_s16 val[2]; /* one v128 SSD accumulator per v256 half */
+} ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16_init(); /* fresh accumulator for val[1] */
+  t.val[0] = v128_ssd_s16_init(); /* fresh accumulator for val[0] */
+  return t;
+}
+
+/* Per-half v128_ssd_s16() accumulation of a and b into s.  Implementation
+ * dependent return value; must be finalised with v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); /* both halves */
+}
 
 #endif /* _V256_INTRINSICS_V128_H */
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
index b82daab68..05f205169 100644
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -14,7 +14,7 @@
 
 #if !defined(__AVX2__)
 
-#include "./v256_intrinsics_v128.h"
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
 
 #else
 
@@ -26,7 +26,8 @@
 #endif
 
 #include <immintrin.h>
-#include "./v128_intrinsics_x86.h"
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
 
 typedef __m256i v256;
 
@@ -38,9 +39,9 @@ SIMD_INLINE v64 v256_low_v64(v256 a) {
   return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
 }
 
-SIMD_INLINE v128 v256_low_v128(v256 a) {
-  return _mm256_extracti128_si256(a, 0);
-}
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
 
 SIMD_INLINE v128 v256_high_v128(v256 a) {
   return _mm256_extracti128_si256(a, 1);
@@ -48,8 +49,7 @@ SIMD_INLINE v128 v256_high_v128(v256 a) {
 
 SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
   // gcc seems to be missing _mm256_set_m128i()
-  return _mm256_insertf128_si256(
-      _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
 }
 
 SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
@@ -84,16 +84,28 @@ SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
 
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
 
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
 
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
 
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
+
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
   return _mm256_adds_epi16(a, b);
 }
 
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
 
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); /* x1, add byte pairs */
+}
+
 SIMD_INLINE v256 v256_padd_s16(v256 a) {
   return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
 }
@@ -116,6 +128,8 @@ SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
 
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
 
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
+
 SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
 
 SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
@@ -125,43 +139,51 @@ SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
 // unpack/pack intrinsics operate on the 256 bit input vector as 2
 // independent 128 bit vectors.
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi8(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi8(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi16(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi16(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi32(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi32(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi64(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi64(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
@@ -184,34 +206,54 @@ SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
   return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
 }
 
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64( /* packs works per 128-bit lane; fix order */
+      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
+      _MM_SHUFFLE(3, 1, 2, 0)); /* selects source qwords 0, 2, 1, 3 */
+}
+
 SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
+  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
 }
 
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
+  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
 }
 
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+                                            _mm256_castsi256_ps(a),
+                                            _MM_SHUFFLE(3, 1, 3, 1))),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(
+      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+                                            _mm256_castsi256_ps(a),
+                                            _MM_SHUFFLE(2, 0, 2, 0))),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
+                                            _mm256_castsi256_pd(a), 15)),
+      _MM_SHUFFLE(3, 1, 2, 0)); /* mask 15 = high qwords; then order 0,2,1,3 */
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castpd_si256( /* mask 0: low qword of b and of a in each lane */
+          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
+      _MM_SHUFFLE(3, 1, 2, 0)); /* selects source qwords 0, 2, 1, 3 */
 }
 
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
@@ -219,13 +261,15 @@ SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
-                        v128_unpacklo_u8_s16(v256_low_v128(a)));
+  return _mm256_unpacklo_epi8(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
-                        v128_unpacklo_u8_s16(v256_high_v128(a)));
+  return _mm256_unpackhi_epi8(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
@@ -233,28 +277,37 @@ SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)),
-                        v128_unpacklo_s8_s16(v256_low_v128(a)));
+  return _mm256_srai_epi16(
+      _mm256_unpacklo_epi8(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      8);
 }
 
 SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)),
-                        v128_unpacklo_s8_s16(v256_high_v128(a)));
+  return _mm256_srai_epi16(
+      _mm256_unpackhi_epi8(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      8);
 }
 
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
@@ -266,43 +319,73 @@ SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
-                        v128_unpacklo_u16_s32(v256_low_v128(a)));
+  return _mm256_unpacklo_epi16(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
-                        v128_unpacklo_s16_s32(v256_low_v128(a)));
+  return _mm256_srai_epi32(
+      _mm256_unpacklo_epi16(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      16);
 }
 
 SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
-                        v128_unpacklo_u16_s32(v256_high_v128(a)));
+  return _mm256_unpackhi_epi16(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
-                        v128_unpacklo_s16_s32(v256_high_v128(a)));
+  return _mm256_srai_epi32(
+      _mm256_unpackhi_epi16(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      16);
 }
+
 SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  v128 c16 = v128_dup_8(16);
-  v128 hi = v256_high_v128(pattern);
-  v128 lo = v256_low_v128(pattern);
-  v128 maskhi = v128_cmplt_s8(hi, c16);
-  v128 masklo = v128_cmplt_s8(lo, c16);
-  return v256_from_v128(
-      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
-              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
-                        maskhi)),
-      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
-              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
-                        masklo)));
+  return _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  v256 c32 = v256_dup_8(32);
+  v256 p32 = v256_sub_8(pattern, c32);
+  v256 r1 = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
+      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
+  v256 r2 = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
 }
 
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
   return _mm256_shuffle_epi8(a, pattern);
 }
 
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
+  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
+  t1 = _mm256_add_epi32(t1, t2);
+  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
+                         _mm256_extracti128_si256(t1, 1));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v128_low_u32(t);
+}
+
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
   v256 r = _mm256_madd_epi16(a, b);
 #if defined(__x86_64__)
@@ -326,6 +409,29 @@ SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
 #endif
 }
 
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  v256 r = _mm256_mullo_epi32(a, b);
+#if defined(__x86_64__)
+  v128 t;
+  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
+  t = v256_low_v128(_mm256_add_epi64(
+      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+  v128 l = v256_low_v128(r);
+  v128 h = v256_high_v128(r);
+  return (int64_t)_mm_cvtsi128_si32(l) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+         (int64_t)_mm_cvtsi128_si32(h) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
   v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
   v128 lo = v256_low_v128(t);
@@ -341,7 +447,7 @@ SIMD_INLINE sad256_internal v256_sad_u8_init() {
 }
 
 /* Implementation dependent return value.  Result must be finalised with
-   v256_sad_sum().
+   v256_sad_u8_sum().
    The result for more than 32 v256_sad_u8() calls is undefined. */
 SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
   return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
@@ -359,7 +465,7 @@ SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
 }
 
 /* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_sum(). */
+ * v256_ssd_u8_sum(). */
 SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
   v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                             _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
@@ -425,6 +531,12 @@ SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
       _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
 }
 
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return _mm256_sub_epi16(
+      _mm256_avg_epu16(a, b),
+      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
+}
+
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
 
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
@@ -433,18 +545,28 @@ SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
 
 SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
 
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return _mm256_blendv_epi8(a, b, c);
+}
+
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
 
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
 
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
 
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
+
 SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
   return _mm256_cmpgt_epi8(a, b);
 }
 
 SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
+  return _mm256_cmpgt_epi8(b, a);
 }
 
 SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
@@ -456,13 +578,25 @@ SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
 }
 
 SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
+  return _mm256_cmpgt_epi16(b, a);
 }
 
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
   return _mm256_cmpeq_epi16(a, b);
 }
 
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return _mm256_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return _mm256_cmpgt_epi32(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  return _mm256_cmpeq_epi32(a, b);
+}
+
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
                           _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
@@ -503,27 +637,42 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
   return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
 }
 
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+  return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+  return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+#if defined(__AVX512VL__)  // _mm256_sra_epi64 needs AVX-512VL, not only F
+  return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+#else  // otherwise shift each 128-bit half separately and recombine
+  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
+                        v128_shr_s64(v256_low_v128(a), c));
+#endif
+}
+
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
 // _mm256_slli_si256 works on 128 bit lanes and can't be used
-#define v256_shl_n_byte(a, n)                                                 \
-  ((n) < 16                                                                   \
-       ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n),        \
-                                v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
-                        v128_shl_n_byte(v256_low_v128(a), n))                 \
-       : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16),            \
-                        v128_zero()))
+#define v256_shl_n_byte(a, n)                                                \
+  ((n) < 16 ? v256_from_v128(                                                \
+                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
+                  v128_shl_n_byte(v256_low_v128(a), n))                      \
+            : _mm256_inserti128_si256(                                       \
+                  _mm256_setzero_si256(),                                    \
+                  v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
 
 // _mm256_srli_si256 works on 128 bit lanes and can't be used
-#define v256_shr_n_byte(a, n)                                                 \
-  ((n) < 16                                                                   \
-       ? _mm256_alignr_epi8(                                                  \
-             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n)  \
-       : ((n) > 16                                                            \
-              ? _mm256_srli_si256(                                            \
-                    _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
-                    (n)-16)                                                   \
-              : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))
+#define v256_shr_n_byte(a, n)                                                \
+  ((n) < 16                                                                  \
+       ? _mm256_alignr_epi8(                                                 \
+             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+       : _mm256_inserti128_si256(                                            \
+             _mm256_setzero_si256(),                                         \
+             v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
 
 // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
 #define v256_align(a, b, c) \
@@ -543,6 +692,59 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
 #define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
 #define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
 #define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
+#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
+#define v256_shr_n_s64(a, c) \
+  v256_shr_s64((a), (c))  // _mm256_srai_epi64 broken in gcc?
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef v256 sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_sad_u16_sum(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+#if defined(__SSE4_1__)
+  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
+#else
+  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
+                          v256_xor(b, v256_dup_16(32768)));
+  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
+                  v256_or(v256_and(a, t), v256_andn(b, t)));
+#endif
+  return v256_add_32(
+      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
+  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
+         v128_low_u32(v128_shr_n_byte(t, 8)) +
+         v128_low_u32(v128_shr_n_byte(t, 12));
+}
+
+typedef v256 ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  v256 d = v256_sub_16(a, b);
+  d = v256_madd_s16(d, d);
+  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
+                                    _mm256_unpacklo_epi32(d, v256_zero())));
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
+  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
+}
+
 #endif
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
index 5c0042d8c..6ce53c6a9 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -14,7 +14,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./v64_intrinsics_c.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
 
 /* Fallback to plain, unoptimised C. */
 
@@ -71,6 +72,8 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
 
 SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
 SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
 SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
 SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
 SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
@@ -100,6 +103,9 @@ SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
 SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
   return c_v64_pack_s32_s16(a, b);
 }
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+  return c_v64_pack_s32_u16(a, b);
+}
 SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
   return c_v64_pack_s16_u8(a, b);
 }
@@ -156,6 +162,7 @@ SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
 
 SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
 SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
 SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
 SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
 SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
index c7574eef5..267441b02 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -13,7 +13,8 @@
 #define _V64_INTRINSICS_H
 
 #include <arm_neon.h>
-#include "./v64_intrinsics_arm.h"
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
 #include "aom_ports/arm.h"
 
 #ifdef AOM_INCOMPATIBLE_GCC
@@ -121,20 +122,34 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) {
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
-  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
+  int16x8_t t =
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
-                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
+                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t);
+#else
+  int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
   return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vaddlvq_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+#else
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
   return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
+#if defined(__aarch64__)
+  return vaddlv_u8(vreinterpret_u8_s64(x));
+#else
   return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+#endif
 }
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
@@ -145,34 +160,40 @@ typedef uint16x8_t sad64_internal;
 
 SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
 
-/* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
+// Implementation dependent return value. Result must be finalised with
+// v64_sad_u8_sum().
 SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
   return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
 }
 
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(s);
+#else
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
   return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+#endif
 }
 
-typedef int64x1_t ssd64_internal;
+typedef uint32x4_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
-  return (ssd64_internal)(uint64_t)0;
-}
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
 
-/* Implementation dependent return value.  Result must be finalised with
- * v64_ssd_u8_sum(). */
+// Implementation dependent return value. Result must be finalised with
+// v64_ssd_u8_sum().
 SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
   uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
-  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
+  return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
 }
 
 SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-  return (uint32_t)(uint64_t)s;
+#if defined(__aarch64__)
+  return vaddvq_u32(s);
+#else
+  uint64x2_t t = vpaddlq_u32(s);
+  return vget_lane_u32(
+      vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
@@ -188,6 +209,16 @@ SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
       vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
+  return vreinterpret_s64_u8(
+      vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
+  return vreinterpret_s64_s8(
+      vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
 SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
   return vreinterpret_s64_s16(
       vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
@@ -252,8 +283,14 @@ SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  int16x8_t t = vreinterpretq_s16_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+  return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
+#else
   return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
       vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
+#endif
 }
 
 SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
@@ -269,10 +306,10 @@ SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
-  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
-      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
-                         vreinterpret_s8_s64(y)),
-                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
+  int16x8_t t =
+      vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
+                vmovl_s8(vreinterpret_s8_s64(y)));
+  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
 }
 
 SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
@@ -285,6 +322,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
       vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(
+      vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
   return vreinterpret_s64_u16(
       vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
@@ -321,33 +363,63 @@ SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
@@ -371,6 +443,11 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
 }
 
+SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(vqmovun_s32(
+      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
   return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
@@ -382,23 +459,43 @@ SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
 }
 
 SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
index 5032238b6..8158899cb 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -17,7 +17,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include "./aom_config.h"
+
+#include "config/aom_config.h"
 
 typedef union {
   uint8_t u8[8];
@@ -30,13 +31,17 @@ typedef union {
   int64_t s64;
 } c_v64;
 
-SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
+  return a.u32[!!CONFIG_BIG_ENDIAN];
+}
 
 SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
   return a.u32[!CONFIG_BIG_ENDIAN];
 }
 
-SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
+  return a.s32[!!CONFIG_BIG_ENDIAN];
+}
 
 SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
   return a.s32[!CONFIG_BIG_ENDIAN];
@@ -45,7 +50,7 @@ SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
 SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
   c_v64 t;
   t.u32[!CONFIG_BIG_ENDIAN] = x;
-  t.u32[CONFIG_BIG_ENDIAN] = y;
+  t.u32[!!CONFIG_BIG_ENDIAN] = y;
   return t;
 }
 
@@ -177,6 +182,30 @@ SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 8; c++)
+    t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
+                  ? 255
+                  : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
+                        ? 0
+                        : (int16_t)a.u8[c] + (int16_t)b.u8[c];
+  return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 8; c++)
+    t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
+                  ? 127
+                  : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
+                        ? -128
+                        : (int16_t)a.s8[c] + (int16_t)b.s8[c];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
@@ -206,8 +235,7 @@ SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
 SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 8; c++)
-    t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c];
+  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
   return t;
 }
 
@@ -459,6 +487,20 @@ SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  if (CONFIG_BIG_ENDIAN) {
+    c_v64 u = a;
+    a = b;
+    b = u;
+  }
+  t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
+  t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
+  t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
+  t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
   c_v64 t;
   if (CONFIG_BIG_ENDIAN) {
@@ -670,6 +712,13 @@ SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
index 8dcc9f6fc..130052ee1 100644
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -90,8 +90,7 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
   _mm_storel_epi64((__m128i *)p, a);
 }
 
-// The following function requires an immediate.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 #define v64_align(a, b, c) \
   ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
 #else
@@ -112,6 +111,10 @@ SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
 
 SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
+
 SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
 
 SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
@@ -170,6 +173,22 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
   return _mm_packs_epi32(t, t);
 }
 
+// Packs the 32-bit halves of a and b into 16-bit lanes clamped to [0, 65535].
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+#if !defined(__SSE4_1__)
+  // Without SSE4.1 there is no _mm_packus_epi32: clamp each half in scalar.
+  const int32_t a_hi = v64_high_u32(a), a_lo = v64_low_u32(a);
+  const int32_t b_hi = v64_high_u32(b), b_lo = v64_low_u32(b);
+  return v64_from_16(a_hi > 65535 ? 65535 : a_hi < 0 ? 0 : a_hi,
+                     a_lo > 65535 ? 65535 : a_lo < 0 ? 0 : a_lo,
+                     b_hi > 65535 ? 65535 : b_hi < 0 ? 0 : b_hi,
+                     b_lo > 65535 ? 65535 : b_lo < 0 ? 0 : b_lo);
+#else
+  const __m128i joined = _mm_unpacklo_epi64(b, a);
+  return _mm_packus_epi32(joined, joined);
+#endif
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
   __m128i t = _mm_unpacklo_epi64(b, a);
   return _mm_packus_epi16(t, t);
@@ -272,14 +291,11 @@ SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
-  __m128i r, r1, r2, z;
-  z = _mm_setzero_si128();
-  r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
-                      _mm_unpacklo_epi8(b, z));
-  r2 = _mm_srli_si128(r1, 8);
-  r = _mm_add_epi32(r1, r2);
-  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
-  return ((int32_t)v64_low_u32(r)) >> 8;
+  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
+                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v64_low_u32(t);
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
@@ -371,6 +387,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
+  const v64 odd = _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1));
+  return _mm_sub_epi16(_mm_avg_epu16(a, b), odd);  // undo avg's round-up
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
 
 SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
index 6ae378ff2..6ce3d7acb 100644
--- a/third_party/aom/aom_dsp/ssim.c
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -11,7 +11,9 @@
 
 #include <assert.h>
 #include <math.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/ssim.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/system_state.h"
@@ -31,6 +33,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
     }
   }
 }
+
 void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
                           uint32_t *sum_sq_r, uint32_t *sum_sxr) {
@@ -46,7 +49,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
                                  int rp, uint32_t *sum_s, uint32_t *sum_r,
                                  uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -62,7 +64,6 @@ void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
     }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
@@ -108,7 +109,6 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
-#if CONFIG_HIGHBITDEPTH
 static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
                               int rp, uint32_t bd, uint32_t shift) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
@@ -117,7 +117,6 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
   return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
                     sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
@@ -142,7 +141,6 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
   return ssim_total;
 }
 
-#if CONFIG_HIGHBITDEPTH
 static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
                                int stride_img1, int stride_img2, int width,
                                int height, uint32_t bd, uint32_t shift) {
@@ -164,7 +162,6 @@ static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
   ssim_total /= samples;
   return ssim_total;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
                      const YV12_BUFFER_CONFIG *dest, double *weight) {
@@ -422,7 +419,6 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   return inconsistency_total;
 }
 
-#if CONFIG_HIGHBITDEPTH
 double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest, double *weight,
                             uint32_t bd, uint32_t in_bd) {
@@ -441,4 +437,3 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
   *weight = 1;
   return abc[0] * .8 + .1 * (abc[1] + abc[2]);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
index 902735e50..c8a389dfe 100644
--- a/third_party/aom/aom_dsp/ssim.h
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -18,7 +18,8 @@
 extern "C" {
 #endif
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_scale/yv12config.h"
 
 // metrics used for calculating ssim, ssim2, dssim, and ssimc
@@ -75,11 +76,9 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          double *ssim_u, double *ssim_v, uint32_t bd,
                          uint32_t in_bd);
 
-#if CONFIG_HIGHBITDEPTH
 double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest, double *weight,
                             uint32_t bd, uint32_t in_bd);
-#endif  // CONFIG_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
index 8dda96efb..2f6da96e5 100644
--- a/third_party/aom/aom_dsp/subtract.c
+++ b/third_party/aom/aom_dsp/subtract.c
@@ -11,8 +11,8 @@
 
 #include <stdlib.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -32,7 +32,6 @@ void aom_subtract_block_c(int rows, int cols, int16_t *diff,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src8,
                                  ptrdiff_t src_stride, const uint8_t *pred8,
@@ -52,4 +51,3 @@ void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
     src += src_stride;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
index b9155fdc0..44ec41f2e 100644
--- a/third_party/aom/aom_dsp/sum_squares.c
+++ b/third_party/aom/aom_dsp/sum_squares.c
@@ -11,7 +11,7 @@
 
 #include <assert.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
                                   int height) {
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
index ef9e9bc98..7deb0aea3 100644
--- a/third_party/aom/aom_dsp/txfm_common.h
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -28,25 +28,11 @@ typedef struct txfm_param {
   TX_SIZE tx_size;
   int lossless;
   int bd;
-#if CONFIG_MRC_TX || CONFIG_LGT
-  int is_inter;
-#endif  // CONFIG_MRC_TX || CONFIG_LGT
-#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-  int stride;
-  uint8_t *dst;
-#if CONFIG_MRC_TX
-  int *valid_mask;
-  uint8_t *mask;
-#endif  // CONFIG_MRC_TX
-#if CONFIG_LGT_FROM_PRED
-  int mode;
-  int use_lgt;
-#endif  // CONFIG_LGT_FROM_PRED
-#endif  // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED
-// for inverse transforms only
-#if CONFIG_ADAPT_SCAN
-  const int16_t *eob_threshold;
-#endif
+  // are the pixel buffers octets or shorts?  This should collapse to
+  // bd==8 implies !is_hbd, but that's not certain right now.
+  int is_hbd;
+  TxSetType tx_set_type;
+  // for inverse transforms only
   int eob;
 } TxfmParam;
 
@@ -102,647 +88,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
   return rv;
 }
 
-#if CONFIG_LGT_FROM_PRED
-// Use negative numbers so they do not coincide with lgt*[0][0], which are
-// always nonnegative.
-typedef enum {
-  DCT4 = -1,
-  ADST4 = -2,
-  DCT8 = -3,
-  ADST8 = -4,
-  DCT16 = -5,
-  ADST16 = -6,
-  DCT32 = -7,
-  ADST32 = -8,
-} ButterflyLgt;
-
-/* These are some LGTs already implementated in the codec. When any of them
- * is chosen, the flgt or ilgt function will call the existing fast
- * transform instead of the matrix product implementation. Thus, we
- * do not need the actual basis functions here */
-static const tran_high_t lgt4_000[1][1] = { { (tran_high_t)DCT4 } };
-static const tran_high_t lgt4_100[1][1] = { { (tran_high_t)ADST4 } };
-static const tran_high_t lgt8_000[1][1] = { { (tran_high_t)DCT8 } };
-static const tran_high_t lgt8_200[1][1] = { { (tran_high_t)ADST8 } };
-static const tran_high_t lgt16_000[1][1] = { { (tran_high_t)DCT16 } };
-static const tran_high_t lgt16_200[1][1] = { { (tran_high_t)ADST16 } };
-static const tran_high_t lgt32_000[1][1] = { { (tran_high_t)DCT32 } };
-static const tran_high_t lgt32_200[1][1] = { { (tran_high_t)ADST32 } };
-
-/* The Line Graph Transforms (LGTs) matrices are written as follows.
-   Each 2D array is sqrt(2)*16384 times an LGT matrix, which is the
-   matrix of eigenvectors of the graph Laplacian matrix of the associated
-   line graph. Some of those transforms have fast algorithms but not
-   implemented yet for now. */
-
-// LGT4 name: lgt4_150_000w3
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_150_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 5991, 13537, 17825, 0 },
-  { 15515, 10788, -13408, 0 },
-  { 16133, -15403, 6275, 0 },
-};
-
-// LGT4 name: lgt4_100_000w3
-// Self loops: 1.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_100_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 7600, 13694, 17076, 0 },
-  { 17076, 7600, -13694, 0 },
-  { 13694, -17076, 7600, 0 },
-};
-
-// LGT4 name: lgt4_060_000w3
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_060_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 9449, 13755, 16075, 0 },
-  { 17547, 4740, -14370, 0 },
-  { 11819, -18034, 8483, 0 },
-};
-
-// LGT4 name: lgt4_000w3
-// Self loops: 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000
-static const tran_high_t lgt4_000w3[4][4] = {
-  { 0, 0, 0, 23170 },
-  { 13377, 13377, 13377, 0 },
-  { 16384, 0, -16384, 0 },
-  { 9459, -18919, 9459, 0 },
-};
-
-// LGT4 name: lgt4_150_000w2
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_150_000w2[4][4] = {
-  { 10362, 20724, 0, 0 },
-  { 20724, -10362, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_100_000w2
-// Self loops: 1.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_100_000w2[4][4] = {
-  { 12181, 19710, 0, 0 },
-  { 19710, -12181, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_060_000w2
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_060_000w2[4][4] = {
-  { 13831, 18590, 0, 0 },
-  { 18590, -13831, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_000w2
-// Self loops: 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000
-static const tran_high_t lgt4_000w2[4][4] = {
-  { 16384, 16384, 0, 0 },
-  { 16384, -16384, 0, 0 },
-  { 0, 0, 16384, 16384 },
-  { 0, 0, 16384, -16384 },
-};
-
-// LGT4 name: lgt4_150_000w1
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_150_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_100_000w1
-// Self loops: 1.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_100_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_060_000w1
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_060_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_000w1
-// Self loops: 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000
-static const tran_high_t lgt4_000w1[4][4] = {
-  { 23170, 0, 0, 0 },
-  { 0, 13377, 13377, 13377 },
-  { 0, 16384, 0, -16384 },
-  { 0, 9459, -18919, 9459 },
-};
-
-// LGT4 name: lgt4_060
-// Self loops: 0.600, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_060[4][4] = {
-  { 6971, 10504, 13060, 14400 },
-  { 14939, 11211, -2040, -13559 },
-  { 14096, -8258, -12561, 10593 },
-  { 8150, -15253, 14295, -5784 },
-};
-
-// LGT4 name: lgt4_150
-// Self loops: 1.500, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_150[4][4] = {
-  { 3998, 9435, 13547, 15759 },
-  { 11106, 15105, 1886, -13483 },
-  { 15260, -1032, -14674, 9361 },
-  { 12833, -14786, 11596, -4372 },
-};
-
-// LGT8 name: lgt8_150_000w7
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_150_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 2522, 6185, 9551, 12461, 14775, 16381, 17204, 0 },
-  { 7390, 15399, 16995, 11515, 1240, -9551, -16365, 0 },
-  { 11716, 16625, 3560, -13353, -15831, -1194, 14733, 0 },
-  { 15073, 8866, -14291, -10126, 13398, 11308, -12401, 0 },
-  { 16848, -4177, -13724, 14441, 2923, -16628, 9513, 0 },
-  { 15942, -14888, 5405, 7137, -15640, 15288, -6281, 0 },
-  { 10501, -14293, 16099, -15670, 13063, -8642, 3021, 0 },
-};
-
-// LGT8 name: lgt8_100_000w7
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_100_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 3518, 6883, 9946, 12575, 14654, 16093, 16829, 0 },
-  { 9946, 16093, 16093, 9946, 0, -9946, -16093, 0 },
-  { 14654, 14654, 0, -14654, -14654, 0, 14654, 0 },
-  { 16829, 3518, -16093, -6883, 14654, 9946, -12575, 0 },
-  { 16093, -9946, -9946, 16093, 0, -16093, 9946, 0 },
-  { 12575, -16829, 9946, 3518, -14654, 16093, -6883, 0 },
-  { 6883, -12575, 16093, -16829, 14654, -9946, 3518, 0 },
-};
-
-// LGT8 name: lgt8_060_000w7
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_060_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 5087, 7951, 10521, 12701, 14411, 15587, 16186, 0 },
-  { 13015, 16486, 14464, 7621, -1762, -10557, -15834, 0 },
-  { 16581, 11475, -4050, -15898, -13311, 1362, 14798, 0 },
-  { 16536, -1414, -16981, -3927, 15746, 8879, -12953, 0 },
-  { 14104, -13151, -7102, 16932, -1912, -15914, 10385, 0 },
-  { 10156, -17168, 11996, 1688, -14174, 16602, -7249, 0 },
-  { 5295, -11721, 15961, -17224, 15274, -10476, 3723, 0 },
-};
-
-// LGT8 name: lgt8_000w7
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000
-static const tran_high_t lgt8_000w7[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 0, 32768 },
-  { 12385, 12385, 12385, 12385, 12385, 12385, 12385, 0 },
-  { 17076, 13694, 7600, 0, -7600, -13694, -17076, 0 },
-  { 15781, 3898, -10921, -17515, -10921, 3898, 15781, 0 },
-  { 13694, -7600, -17076, 0, 17076, 7600, -13694, 0 },
-  { 10921, -15781, -3898, 17515, -3898, -15781, 10921, 0 },
-  { 7600, -17076, 13694, 0, -13694, 17076, -7600, 0 },
-  { 3898, -10921, 15781, -17515, 15781, -10921, 3898, 0 },
-};
-
-// LGT8 name: lgt8_150_000w6
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_150_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 3157, 7688, 11723, 15002, 17312, 18506, 0, 0 },
-  { 9167, 17832, 16604, 6164, -7696, -17286, 0, 0 },
-  { 14236, 15584, -4969, -18539, -6055, 14938, 0, 0 },
-  { 17558, 1891, -18300, 5288, 16225, -11653, 0, 0 },
-  { 17776, -13562, -647, 14380, -17514, 7739, 0, 0 },
-  { 12362, -16318, 17339, -15240, 10399, -3688, 0, 0 },
-};
-
-// LGT8 name: lgt8_100_000w6
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_100_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 4350, 8447, 12053, 14959, 16995, 18044, 0, 0 },
-  { 12053, 18044, 14959, 4350, -8447, -16995, 0, 0 },
-  { 16995, 12053, -8447, -18044, -4350, 14959, 0, 0 },
-  { 18044, -4350, -16995, 8447, 14959, -12053, 0, 0 },
-  { 14959, -16995, 4350, 12053, -18044, 8447, 0, 0 },
-  { 8447, -14959, 18044, -16995, 12053, -4350, 0, 0 },
-};
-
-// LGT8 name: lgt8_060_000w6
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_060_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 6154, 9551, 12487, 14823, 16446, 17277, 0, 0 },
-  { 15149, 17660, 12503, 1917, -9502, -16795, 0, 0 },
-  { 18166, 7740, -11772, -17465, -2656, 15271, 0, 0 },
-  { 16682, -8797, -15561, 10779, 14189, -12586, 0, 0 },
-  { 12436, -18234, 7007, 10763, -18483, 8945, 0, 0 },
-  { 6591, -14172, 18211, -17700, 12766, -4642, 0, 0 },
-};
-
-// LGT8 name: lgt8_000w6
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000
-static const tran_high_t lgt8_000w6[8][8] = {
-  { 0, 0, 0, 0, 0, 0, 23170, 23170 },
-  { 0, 0, 0, 0, 0, 0, 23170, -23170 },
-  { 13377, 13377, 13377, 13377, 13377, 13377, 0, 0 },
-  { 18274, 13377, 4896, -4896, -13377, -18274, 0, 0 },
-  { 16384, 0, -16384, -16384, 0, 16384, 0, 0 },
-  { 13377, -13377, -13377, 13377, 13377, -13377, 0, 0 },
-  { 9459, -18919, 9459, 9459, -18919, 9459, 0, 0 },
-  { 4896, -13377, 18274, -18274, 13377, -4896, 0, 0 },
-};
-
-// LGT8 name: lgt8_150_000w5
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 4109, 9895, 14774, 18299, 20146, 0, 0, 0 },
-  { 11753, 20300, 13161, -4148, -18252, 0, 0, 0 },
-  { 17573, 10921, -16246, -12895, 14679, 0, 0, 0 },
-  { 19760, -9880, -9880, 19760, -9880, 0, 0, 0 },
-  { 14815, -18624, 17909, -12844, 4658, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_100_000w5
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 5567, 10683, 14933, 17974, 19559, 0, 0, 0 },
-  { 14933, 19559, 10683, -5567, -17974, 0, 0, 0 },
-  { 19559, 5567, -17974, -10683, 14933, 0, 0, 0 },
-  { 17974, -14933, -5567, 19559, -10683, 0, 0, 0 },
-  { 10683, -17974, 19559, -14933, 5567, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_060_000w5
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 7650, 11741, 15069, 17415, 18628, 0, 0, 0 },
-  { 17824, 18002, 7558, -7345, -17914, 0, 0, 0 },
-  { 19547, 569, -19303, -8852, 15505, 0, 0, 0 },
-  { 15592, -17548, -2862, 19625, -11374, 0, 0, 0 },
-  { 8505, -17423, 20218, -15907, 6006, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_000w5
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000
-static const tran_high_t lgt8_000w5[8][8] = {
-  { 0, 0, 0, 0, 0, 18919, 18919, 18919 },
-  { 0, 0, 0, 0, 0, 23170, 0, -23170 },
-  { 0, 0, 0, 0, 0, 13377, -26755, 13377 },
-  { 14654, 14654, 14654, 14654, 14654, 0, 0, 0 },
-  { 19710, 12181, 0, -12181, -19710, 0, 0, 0 },
-  { 16766, -6404, -20724, -6404, 16766, 0, 0, 0 },
-  { 12181, -19710, 0, 19710, -12181, 0, 0, 0 },
-  { 6404, -16766, 20724, -16766, 6404, 0, 0, 0 },
-};
-
-// LGT8 name: lgt8_150_000w4
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w4[8][8] = {
-  { 5655, 13343, 19159, 22286, 0, 0, 0, 0 },
-  { 15706, 21362, 2667, -19068, 0, 0, 0, 0 },
-  { 21580, -1459, -20752, 13238, 0, 0, 0, 0 },
-  { 18148, -20910, 16399, -6183, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_100_000w4
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w4[8][8] = {
-  { 7472, 14042, 18919, 21513, 0, 0, 0, 0 },
-  { 18919, 18919, 0, -18919, 0, 0, 0, 0 },
-  { 21513, -7472, -18919, 14042, 0, 0, 0, 0 },
-  { 14042, -21513, 18919, -7472, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_060_000w4
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w4[8][8] = {
-  { 9858, 14855, 18470, 20365, 0, 0, 0, 0 },
-  { 21127, 15855, -2886, -19175, 0, 0, 0, 0 },
-  { 19935, -11679, -17764, 14980, 0, 0, 0, 0 },
-  { 11525, -21570, 20217, -8180, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_000w4
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w4[8][8] = {
-  { 16384, 16384, 16384, 16384, 0, 0, 0, 0 },
-  { 21407, 8867, -8867, -21407, 0, 0, 0, 0 },
-  { 16384, -16384, -16384, 16384, 0, 0, 0, 0 },
-  { 8867, -21407, 21407, -8867, 0, 0, 0, 0 },
-  { 0, 0, 0, 0, 16384, 16384, 16384, 16384 },
-  { 0, 0, 0, 0, 21407, 8867, -8867, -21407 },
-  { 0, 0, 0, 0, 16384, -16384, -16384, 16384 },
-  { 0, 0, 0, 0, 8867, -21407, 21407, -8867 },
-};
-
-// LGT8 name: lgt8_150_000w3
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w3[8][8] = {
-  { 8473, 19144, 25209, 0, 0, 0, 0, 0 },
-  { 21942, 15257, -18961, 0, 0, 0, 0, 0 },
-  { 22815, -21783, 8874, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_100_000w3
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w3[8][8] = {
-  { 10747, 19366, 24149, 0, 0, 0, 0, 0 },
-  { 24149, 10747, -19366, 0, 0, 0, 0, 0 },
-  { 19366, -24149, 10747, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_060_000w3
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w3[8][8] = {
-  { 13363, 19452, 22733, 0, 0, 0, 0, 0 },
-  { 24815, 6704, -20323, 0, 0, 0, 0, 0 },
-  { 16715, -25503, 11997, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_000w3
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w3[8][8] = {
-  { 18919, 18919, 18919, 0, 0, 0, 0, 0 },
-  { 23170, 0, -23170, 0, 0, 0, 0, 0 },
-  { 13377, -26755, 13377, 0, 0, 0, 0, 0 },
-  { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 },
-  { 0, 0, 0, 19710, 12181, 0, -12181, -19710 },
-  { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 },
-  { 0, 0, 0, 12181, -19710, 0, 19710, -12181 },
-  { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 },
-};
-
-// LGT8 name: lgt8_150_000w2
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w2[8][8] = {
-  { 14654, 29309, 0, 0, 0, 0, 0, 0 },
-  { 29309, -14654, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_100_000w2
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w2[8][8] = {
-  { 17227, 27874, 0, 0, 0, 0, 0, 0 },
-  { 27874, -17227, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_060_000w2
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w2[8][8] = {
-  { 19560, 26290, 0, 0, 0, 0, 0, 0 },
-  { 26290, -19560, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_000w2
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w2[8][8] = {
-  { 23170, 23170, 0, 0, 0, 0, 0, 0 },
-  { 23170, -23170, 0, 0, 0, 0, 0, 0 },
-  { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 },
-  { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 },
-  { 0, 0, 16384, 0, -16384, -16384, 0, 16384 },
-  { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 },
-  { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 },
-  { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 },
-};
-
-// LGT8 name: lgt8_150_000w1
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_100_000w1
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_060_000w1
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_000w1
-// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_000w1[8][8] = {
-  { 32768, 0, 0, 0, 0, 0, 0, 0 },
-  { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 },
-  { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 },
-  { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 },
-  { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 },
-  { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 },
-  { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 },
-  { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 },
-};
-
-// LGT8 name: lgt8_060
-// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_060[8][8] = {
-  { 4295, 6746, 8999, 10987, 12653, 13947, 14832, 15280 },
-  { 11303, 15101, 14912, 10786, 3812, -4168, -11047, -15010 },
-  { 15051, 13208, 1823, -10879, -15721, -9207, 3959, 14265 },
-  { 15871, 3800, -13441, -12395, 5516, 15922, 4665, -12939 },
-  { 14630, -7269, -13926, 8618, 13091, -9886, -12133, 11062 },
-  { 12008, -14735, 180, 14586, -12245, -4458, 15932, -8720 },
-  { 8472, -15623, 14088, -4721, -7272, 15221, -14708, 6018 },
-  { 4372, -9862, 13927, -15981, 15727, -13202, 8770, -3071 },
-};
-
-// LGT8 name: lgt8_100
-// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_100[8][8] = {
-  { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827 },
-  { 8368, 14228, 15827, 12684, 5742, -2921, -10708, -15288 },
-  { 12684, 15288, 5742, -8368, -15827, -10708, 2921, 14228 },
-  { 15288, 8368, -10708, -14228, 2921, 15827, 5742, -12684 },
-  { 15827, -2921, -15288, 5742, 14228, -8368, -12684, 10708 },
-  { 14228, -12684, -2921, 15288, -10708, -5742, 15827, -8368 },
-  { 10708, -15827, 12684, -2921, -8368, 15288, -14228, 5742 },
-  { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 },
-};
-#endif  // CONFIG_LGT_FROM_PRED
-
-#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
-// LGT4 name: lgt4_170
-// Self loops: 1.700, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_170[4][4] = {
-  { 3636, 9287, 13584, 15902 },
-  { 10255, 15563, 2470, -13543 },
-  { 14786, 711, -15249, 9231 },
-  { 14138, -14420, 10663, -3920 },
-};
-
-// LGT4 name: lgt4_140
-// Self loops: 1.400, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000
-static const tran_high_t lgt4_140[4][4] = {
-  { 4206, 9518, 13524, 15674 },
-  { 11552, 14833, 1560, -13453 },
-  { 15391, -1906, -14393, 9445 },
-  { 12201, -14921, 12016, -4581 },
-};
-
-// LGT8 name: lgt8_170
-// Self loops: 1.700, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_170[8][8] = {
-  { 1858, 4947, 7850, 10458, 12672, 14411, 15607, 16217 },
-  { 5494, 13022, 16256, 14129, 7343, -1864, -10456, -15601 },
-  { 8887, 16266, 9500, -5529, -15749, -12273, 1876, 14394 },
-  { 11870, 13351, -6199, -15984, -590, 15733, 7273, -12644 },
-  { 14248, 5137, -15991, 291, 15893, -5685, -13963, 10425 },
-  { 15716, -5450, -10010, 15929, -6665, -8952, 16036, -7835 },
-  { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 },
-  { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 },
-};
-
-// LGT8 name: lgt8_150
-// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
-// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000
-static const tran_high_t lgt8_150[8][8] = {
-  { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 },
-  { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 },
-  { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 },
-  { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 },
-  { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 },
-  { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 },
-  { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 },
-  { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 },
-};
-#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED
 #endif  // AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
index 3c99aa155..d367905bc 100644
--- a/third_party/aom/aom_dsp/variance.c
+++ b/third_party/aom/aom_dsp/variance.c
@@ -8,22 +8,24 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
-#include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
 
-#include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
@@ -106,12 +108,12 @@ uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
 // It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const uint8_t *filter) {
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const uint8_t *filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; ++i) {
@@ -136,12 +138,12 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
 // filter is applied horizontally (pixel_step = 1) or vertically
 // (pixel_step = stride). It defines the offset required to move from one input
 // to the next. Output is 8-bit.
-static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const uint8_t *filter) {
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              unsigned int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; ++i) {
@@ -165,38 +167,55 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
     return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
   }
 
-#define SUBPIX_VAR(W, H)                                                \
-  uint32_t aom_sub_pixel_variance##W##x##H##_c(                         \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
-      const uint8_t *b, int b_stride, uint32_t *sse) {                  \
-    uint16_t fdata3[(H + 1) * W];                                       \
-    uint8_t temp2[H * W];                                               \
-                                                                        \
-    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                      bilinear_filters_2t[xoffset]);    \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
-                                       bilinear_filters_2t[yoffset]);   \
-                                                                        \
-    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);       \
+#define SUBPIX_VAR(W, H)                                                      \
+  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
+      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint8_t temp2[H * W];                                                     \
+                                                                              \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]);    \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
+                                             bilinear_filters_2t[yoffset]);   \
+                                                                              \
+    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
   }
 
-#define SUBPIX_AVG_VAR(W, H)                                            \
-  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                     \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
-      const uint8_t *b, int b_stride, uint32_t *sse,                    \
-      const uint8_t *second_pred) {                                     \
-    uint16_t fdata3[(H + 1) * W];                                       \
-    uint8_t temp2[H * W];                                               \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                         \
-                                                                        \
-    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                      bilinear_filters_2t[xoffset]);    \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
-                                       bilinear_filters_2t[yoffset]);   \
-                                                                        \
-    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);              \
-                                                                        \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);       \
+#define SUBPIX_AVG_VAR(W, H)                                                  \
+  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
+      const uint8_t *b, int b_stride, uint32_t *sse,                          \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint8_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
+                                                                              \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]);    \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
+                                             bilinear_filters_2t[yoffset]);   \
+                                                                              \
+    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
+                                                                              \
+    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
+  }                                                                           \
+  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c(                       \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
+      const uint8_t *b, int b_stride, uint32_t *sse,                          \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint8_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
+                                                                              \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]);    \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
+                                             bilinear_filters_2t[yoffset]);   \
+                                                                              \
+    aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param);     \
+                                                                              \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
   }
 
 /* Identical to the variance call except it takes an additional parameter, sum,
@@ -229,11 +248,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
   SUBPIX_VAR(W, H)      \
   SUBPIX_AVG_VAR(W, H)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 VARIANCES(128, 128)
 VARIANCES(128, 64)
 VARIANCES(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 VARIANCES(64, 64)
 VARIANCES(64, 32)
 VARIANCES(32, 64)
@@ -250,19 +267,12 @@ VARIANCES(4, 4)
 VARIANCES(4, 2)
 VARIANCES(2, 4)
 VARIANCES(2, 2)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 VARIANCES(4, 16)
 VARIANCES(16, 4)
 VARIANCES(8, 32)
 VARIANCES(32, 8)
 VARIANCES(16, 64)
 VARIANCES(64, 16)
-#if CONFIG_EXT_PARTITION
-VARIANCES(32, 128)
-VARIANCES(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 
 GET_VAR(16, 16)
 GET_VAR(8, 8)
@@ -288,61 +298,142 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 }
 
 // Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                          int mi_row, int mi_col, const MV *const mv,
+                          uint8_t *comp_pred, int width, int height,
                           int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
                           int ref_stride) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
-    int i;
-    for (i = 0; i < height; i++) {
+    for (int i = 0; i < height; i++) {
       memcpy(comp_pred, ref, width * sizeof(*comp_pred));
       comp_pred += width;
       ref += ref_stride;
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                            -1, width, height);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                           16, width, height);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t,
-                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
-      aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1),
-                            ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
-                            -1, width, intermediate_height);
-      aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
-                           MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y,
-                           16, width, height);
-    }
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
+                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                        intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                       width, height);
   }
 }
 
-void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                   int mi_row, int mi_col, const MV *const mv,
+                                   uint8_t *comp_pred, const uint8_t *pred,
                                    int width, int height, int subpel_x_q3,
                                    int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride) {
   int i, j;
 
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
@@ -352,26 +443,68 @@ void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
-static void highbd_variance64(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int w, int h,
-                              uint64_t *sse, int64_t *sum) {
+void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const JNT_COMP_PARAMS *jcp_param) {
   int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
 
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint8_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
 
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
+void aom_jnt_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint8_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int w, int h,
+                              uint64_t *sse, int64_t *sum) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t tsum = 0;
+  uint64_t tsse = 0;
+  for (int i = 0; i < h; ++i) {
+    int32_t lsum = 0;
+    for (int j = 0; j < w; ++j) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      lsum += diff;
+      tsse += (uint32_t)(diff * diff);
     }
+    tsum += lsum;
     a += a_stride;
     b += b_stride;
   }
+  *sum = tsum;
+  *sse = tsse;
 }
 
 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
@@ -573,65 +706,125 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
                                                dst, dst_stride, sse);        \
   }
 
-#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
-  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
-                               CONVERT_TO_BYTEPTR(temp2), W);                \
-                                                                             \
-    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
-                                              dst, dst_stride, sse);         \
-  }                                                                          \
-                                                                             \
-  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
-                               CONVERT_TO_BYTEPTR(temp2), W);                \
-                                                                             \
-    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
-                                               dst, dst_stride, sse);        \
-  }                                                                          \
-                                                                             \
-  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                     \
-                               CONVERT_TO_BYTEPTR(temp2), W);                \
-                                                                             \
-    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
-                                               dst, dst_stride, sse);        \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                           \
+  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
+                                              dst, dst_stride, sse);          \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                 \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred) {                                           \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H,                      \
+                               CONVERT_TO_BYTEPTR(temp2), W);                 \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
+                                               dst, dst_stride, sse);         \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
+                                          dst_stride, sse);                   \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
+  }                                                                           \
+                                                                              \
+  uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c(             \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *dst, int dst_stride, uint32_t *sse,                      \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint16_t fdata3[(H + 1) * W];                                             \
+    uint16_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                              \
+                                                                              \
+    aom_highbd_var_filter_block2d_bil_first_pass(                             \
+        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);  \
+    aom_highbd_var_filter_block2d_bil_second_pass(                            \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);             \
+                                                                              \
+    aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H,                    \
+                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);    \
+                                                                              \
+    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                           dst_stride, sse);                  \
   }
 
 /* All three forms of the variance are available in the same sizes. */
@@ -640,11 +833,9 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
   HIGHBD_SUBPIX_VAR(W, H)      \
   HIGHBD_SUBPIX_AVG_VAR(W, H)
 
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(128, 128)
 HIGHBD_VARIANCES(128, 64)
 HIGHBD_VARIANCES(64, 128)
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(64, 64)
 HIGHBD_VARIANCES(64, 32)
 HIGHBD_VARIANCES(32, 64)
@@ -661,19 +852,12 @@ HIGHBD_VARIANCES(4, 4)
 HIGHBD_VARIANCES(4, 2)
 HIGHBD_VARIANCES(2, 4)
 HIGHBD_VARIANCES(2, 2)
-
-#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 HIGHBD_VARIANCES(4, 16)
 HIGHBD_VARIANCES(16, 4)
 HIGHBD_VARIANCES(8, 32)
 HIGHBD_VARIANCES(32, 8)
 HIGHBD_VARIANCES(16, 64)
 HIGHBD_VARIANCES(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_VARIANCES(32, 128)
-HIGHBD_VARIANCES(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES
 
 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
@@ -700,9 +884,99 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+                                 const struct AV1Common *const cm, int mi_row,
+                                 int mi_col, const MV *const mv,
+                                 uint16_t *comp_pred, int width, int height,
                                  int subpel_x_q3, int subpel_y_q3,
                                  const uint8_t *ref8, int ref_stride, int bd) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+      uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     const uint16_t *ref;
     int i;
@@ -712,57 +986,48 @@ void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
       comp_pred += width;
       ref += ref_stride;
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                               width, kernel, 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                              width, NULL, -1, kernel, 16, width, height, bd);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_highbd_convolve8_horiz_c(ref8, ref_stride,
-                                   CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
-                                   16, NULL, -1, width, height, bd);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      /*Directly call C version to allow this to work for small (2x2) sizes.*/
-      aom_highbd_convolve8_vert_c(ref8, ref_stride,
-                                  CONVERT_TO_BYTEPTR(comp_pred), width, NULL,
-                                  -1, kernel, 16, width, height, bd);
-    } else {
-      DECLARE_ALIGNED(16, uint16_t,
-                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
-      aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1),
-                                   ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                   MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                   intermediate_height, bd);
-      aom_highbd_convolve8_vert_c(
-          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
-          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
-          16, width, height, bd);
-    }
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+                               ref_stride, CONVERT_TO_BYTEPTR(temp),
+                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                               intermediate_height, bd);
+    aom_highbd_convolve8_vert(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+        16, width, height, bd);
   }
 }
 
-void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
-                                          const uint8_t *pred8, int width,
-                                          int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref8,
-                                          int ref_stride, int bd) {
+void aom_highbd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd) {
   int i, j;
 
   const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
@@ -771,69 +1036,109 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
     pred += width;
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
-#if CONFIG_AV1
-void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride,
-                          const uint8_t *mask, int mask_stride,
-                          int invert_mask) {
+void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                    int width, int height, const uint8_t *ref8,
+                                    int ref_stride,
+                                    const JNT_COMP_PARAMS *jcp_param) {
   int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
 
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
+      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
     }
     comp_pred += width;
     pred += width;
     ref += ref_stride;
-    mask += mask_stride;
   }
 }
 
-void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, int subpel_x_q3,
-                                    int subpel_y_q3, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask) {
+void aom_highbd_jnt_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
   int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
 
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
     }
     comp_pred += width;
     pred += width;
+  }
+}
+
+void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                          int height, const uint8_t *ref, int ref_stride,
+                          const uint8_t *mask, int mask_stride,
+                          int invert_mask) {
+  int i, j;
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
+    }
+    comp_pred += width;
+    src0 += stride0;
+    src1 += stride1;
     mask += mask_stride;
   }
 }
 
-#define MASK_SUBPIX_VAR(W, H)                                                 \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W,   \
-                                      bilinear_filters_2t[xoffset]);          \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,             \
-                                       bilinear_filters_2t[yoffset]);         \
-                                                                              \
-    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
-                         invert_mask);                                        \
-    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);         \
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                  int mi_row, int mi_col, const MV *const mv,
+                                  uint8_t *comp_pred, const uint8_t *pred,
+                                  int width, int height, int subpel_x_q3,
+                                  int subpel_y_q3, const uint8_t *ref,
+                                  int ref_stride, const uint8_t *mask,
+                                  int mask_stride, int invert_mask) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
+
+#define MASK_SUBPIX_VAR(W, H)                                                  \
+  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
+      const uint8_t *msk, int msk_stride, int invert_mask,                     \
+      unsigned int *sse) {                                                     \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint8_t temp2[H * W];                                                      \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
+                                                                               \
+    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
+                                            W, bilinear_filters_2t[xoffset]);  \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
+                                             bilinear_filters_2t[yoffset]);    \
+                                                                               \
+    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride,  \
+                         invert_mask);                                         \
+    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);          \
   }
 
 MASK_SUBPIX_VAR(4, 4)
@@ -849,26 +1154,16 @@ MASK_SUBPIX_VAR(32, 32)
 MASK_SUBPIX_VAR(32, 64)
 MASK_SUBPIX_VAR(64, 32)
 MASK_SUBPIX_VAR(64, 64)
-#if CONFIG_EXT_PARTITION
 MASK_SUBPIX_VAR(64, 128)
 MASK_SUBPIX_VAR(128, 64)
 MASK_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION_TYPES
 MASK_SUBPIX_VAR(4, 16)
 MASK_SUBPIX_VAR(16, 4)
 MASK_SUBPIX_VAR(8, 32)
 MASK_SUBPIX_VAR(32, 8)
 MASK_SUBPIX_VAR(16, 64)
 MASK_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR(32, 128)
-MASK_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
                                  int width, int height, const uint8_t *ref8,
                                  int ref_stride, const uint8_t *mask,
@@ -891,14 +1186,17 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
 }
 
 void aom_highbd_comp_mask_upsampled_pred_c(
-    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
-    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
-    const uint8_t *mask, int mask_stride, int invert_mask, int bd) {
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd) {
   int i, j;
 
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       if (!invert_mask)
@@ -992,28 +1290,16 @@ HIGHBD_MASK_SUBPIX_VAR(32, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 32)
 HIGHBD_MASK_SUBPIX_VAR(64, 64)
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASK_SUBPIX_VAR(64, 128)
 HIGHBD_MASK_SUBPIX_VAR(128, 64)
 HIGHBD_MASK_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
-
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASK_SUBPIX_VAR(4, 16)
 HIGHBD_MASK_SUBPIX_VAR(16, 4)
 HIGHBD_MASK_SUBPIX_VAR(8, 32)
 HIGHBD_MASK_SUBPIX_VAR(32, 8)
 HIGHBD_MASK_SUBPIX_VAR(16, 64)
 HIGHBD_MASK_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR(32, 128)
-HIGHBD_MASK_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1
-
-#if CONFIG_AV1 && CONFIG_MOTION_VAR
+
 static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask,
                                  int w, int h, unsigned int *sse, int *sum) {
@@ -1044,19 +1330,19 @@ static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
   }
 
-#define OBMC_SUBPIX_VAR(W, H)                                               \
-  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
-    uint16_t fdata3[(H + 1) * W];                                           \
-    uint8_t temp2[H * W];                                                   \
-                                                                            \
-    var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
-                                      bilinear_filters_2t[xoffset]);        \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,           \
-                                       bilinear_filters_2t[yoffset]);       \
-                                                                            \
-    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);       \
+#define OBMC_SUBPIX_VAR(W, H)                                                  \
+  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                       \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint8_t temp2[H * W];                                                      \
+                                                                               \
+    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
+                                            W, bilinear_filters_2t[xoffset]);  \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
+                                             bilinear_filters_2t[yoffset]);    \
+                                                                               \
+    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);          \
   }
 
 OBMC_VAR(4, 4)
@@ -1098,7 +1384,6 @@ OBMC_SUBPIX_VAR(64, 32)
 OBMC_VAR(64, 64)
 OBMC_SUBPIX_VAR(64, 64)
 
-#if CONFIG_EXT_PARTITION
 OBMC_VAR(64, 128)
 OBMC_SUBPIX_VAR(64, 128)
 
@@ -1107,9 +1392,7 @@ OBMC_SUBPIX_VAR(128, 64)
 
 OBMC_VAR(128, 128)
 OBMC_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_PARTITION_TYPES
 OBMC_VAR(4, 16)
 OBMC_SUBPIX_VAR(4, 16)
 OBMC_VAR(16, 4)
@@ -1122,15 +1405,7 @@ OBMC_VAR(16, 64)
 OBMC_SUBPIX_VAR(16, 64)
 OBMC_VAR(64, 16)
 OBMC_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-OBMC_VAR(32, 128)
-OBMC_SUBPIX_VAR(32, 128)
-OBMC_VAR(128, 32)
-OBMC_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
-#if CONFIG_HIGHBITDEPTH
+
 static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
@@ -1301,7 +1576,6 @@ HIGHBD_OBMC_SUBPIX_VAR(64, 32)
 HIGHBD_OBMC_VAR(64, 64)
 HIGHBD_OBMC_SUBPIX_VAR(64, 64)
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_OBMC_VAR(64, 128)
 HIGHBD_OBMC_SUBPIX_VAR(64, 128)
 
@@ -1310,9 +1584,7 @@ HIGHBD_OBMC_SUBPIX_VAR(128, 64)
 
 HIGHBD_OBMC_VAR(128, 128)
 HIGHBD_OBMC_SUBPIX_VAR(128, 128)
-#endif  // CONFIG_EXT_PARTITION
 
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_OBMC_VAR(4, 16)
 HIGHBD_OBMC_SUBPIX_VAR(4, 16)
 HIGHBD_OBMC_VAR(16, 4)
@@ -1325,12 +1597,3 @@ HIGHBD_OBMC_VAR(16, 64)
 HIGHBD_OBMC_SUBPIX_VAR(16, 64)
 HIGHBD_OBMC_VAR(64, 16)
 HIGHBD_OBMC_SUBPIX_VAR(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_OBMC_VAR(32, 128)
-HIGHBD_OBMC_SUBPIX_VAR(32, 128)
-HIGHBD_OBMC_VAR(128, 32)
-HIGHBD_OBMC_SUBPIX_VAR(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
-#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
index a193df467..544dda944 100644
--- a/third_party/aom/aom_dsp/variance.h
+++ b/third_party/aom/aom_dsp/variance.h
@@ -12,7 +12,7 @@
 #ifndef AOM_DSP_VARIANCE_H_
 #define AOM_DSP_VARIANCE_H_
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
 
@@ -33,10 +33,6 @@ typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
 typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
                                   int b_stride, int n);
 
-typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
-                                   unsigned int *sad_array);
-
 typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
                                      const uint8_t *const b_array[],
                                      int b_stride, unsigned int *sad_array);
@@ -54,7 +50,16 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)(
     const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
     int b_stride, unsigned int *sse, const uint8_t *second_pred);
 
-#if CONFIG_AV1
+typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+                                             const uint8_t *b, int b_stride,
+                                             const uint8_t *second_pred,
+                                             const JNT_COMP_PARAMS *jcp_param);
+
+typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
+    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+    int b_stride, unsigned int *sse, const uint8_t *second_pred,
+    const JNT_COMP_PARAMS *jcp_param);
+
 typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
                                             const uint8_t *ref, int ref_stride,
                                             const uint8_t *second_pred,
@@ -64,9 +69,13 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-#endif  // CONFIG_AV1
 
-#if CONFIG_AV1 && CONFIG_MOTION_VAR
+void aom_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask);
+
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
                                           const int32_t *msk);
@@ -78,27 +87,22 @@ typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
 typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
     const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
     const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
-#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR
 
-#if CONFIG_AV1
 typedef struct aom_variance_vtable {
   aom_sad_fn_t sdf;
   aom_sad_avg_fn_t sdaf;
   aom_variance_fn_t vf;
   aom_subpixvariance_fn_t svf;
   aom_subp_avg_variance_fn_t svaf;
-  aom_sad_multi_fn_t sdx3f;
-  aom_sad_multi_fn_t sdx8f;
   aom_sad_multi_d_fn_t sdx4df;
   aom_masked_sad_fn_t msdf;
   aom_masked_subpixvariance_fn_t msvf;
-#if CONFIG_MOTION_VAR
   aom_obmc_sad_fn_t osdf;
   aom_obmc_variance_fn_t ovf;
   aom_obmc_subpixvariance_fn_t osvf;
-#endif  // CONFIG_MOTION_VAR
+  aom_jnt_sad_avg_fn_t jsdaf;
+  aom_jnt_subp_avg_variance_fn_t jsvaf;
 } aom_variance_fn_ptr_t;
-#endif  // CONFIG_AV1
 
 void aom_highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8, uint16_t *output_ptr,
@@ -115,10 +119,8 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
 uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int w, int h);
 
-#if CONFIG_HIGHBITDEPTH
 uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, int w, int h);
-#endif  // CONFIG_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index 4067b0b53..401fbdc48 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
 
 #if HAVE_SSE2
@@ -20,12 +21,6 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2;
 filter8_1dfunction aom_filter_block1d8_h8_sse2;
 filter8_1dfunction aom_filter_block1d4_v8_sse2;
 filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_avg_sse2;
 
 filter8_1dfunction aom_filter_block1d16_v2_sse2;
 filter8_1dfunction aom_filter_block1d16_h2_sse2;
@@ -33,12 +28,6 @@ filter8_1dfunction aom_filter_block1d8_v2_sse2;
 filter8_1dfunction aom_filter_block1d8_h2_sse2;
 filter8_1dfunction aom_filter_block1d4_v2_sse2;
 filter8_1dfunction aom_filter_block1d4_h2_sse2;
-filter8_1dfunction aom_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
 
 // void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
@@ -50,47 +39,16 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
 //                              const int16_t *filter_x, int x_step_q4,
 //                              const int16_t *filter_y, int y_step_q4,
 //                              int w, int h);
-// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
-//                                  int w, int h);
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                         uint8_t *dst, ptrdiff_t dst_stride,
-//                         const int16_t *filter_x, int x_step_q4,
-//                         const int16_t *filter_y, int y_step_q4,
-//                         int w, int h);
-// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                             uint8_t *dst, ptrdiff_t dst_stride,
-//                             const int16_t *filter_x, int x_step_q4,
-//                             const int16_t *filter_y, int y_step_q4,
-//                             int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
 
-#if CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#if ARCH_X86_64
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2;
 
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
@@ -98,12 +56,6 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
 
 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
 //                                      ptrdiff_t src_stride,
@@ -123,60 +75,8 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
 //                                     const int16_t *filter_y,
 //                                     int y_step_q4,
 //                                     int w, int h, int bd);
-// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
-//                                          ptrdiff_t src_stride,
-//                                          uint8_t *dst,
-//                                          ptrdiff_t dst_stride,
-//                                          const int16_t *filter_x,
-//                                          int x_step_q4,
-//                                          const int16_t *filter_y,
-//                                          int y_step_q4,
-//                                          int w, int h, int bd);
-// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
-//                                         ptrdiff_t src_stride,
-//                                         uint8_t *dst,
-//                                         ptrdiff_t dst_stride,
-//                                         const int16_t *filter_x,
-//                                         int x_step_q4,
-//                                         const int16_t *filter_y,
-//                                         int y_step_q4,
-//                                         int w, int h, int bd);
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 sse2);
-
-// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h, int bd);
-// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
 
-#if CONFIG_LOOP_RESTORATION
-// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
-// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
-void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h, int bd) {
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  ((int16_t *)filter_x)[3] += 128;
-  ((int16_t *)filter_y)[3] += 128;
-  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h, bd);
-  ((int16_t *)filter_x)[3] -= 128;
-  ((int16_t *)filter_y)[3] -= 128;
-}
-#endif  // CONFIG_LOOP_RESTORATION
-#endif  // CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
index 4d3142867..7283c32b8 100644
--- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -50,7 +50,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   cmp r4d, 32
   je .w32
 
-%if CONFIG_AV1 && CONFIG_EXT_PARTITION
   cmp r4d, 64
   je .w64
 %ifidn %2, highbd
@@ -160,50 +159,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   jnz .loop128
   RET
 
-%else  ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-
-%ifidn %2, highbd
-  cmp r4d, 64
-  je .w64
-
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-%endif
-%endif  ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-
 .w64:
   mov                    r4d, dword hm
 .loop64:
@@ -339,7 +294,4 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
 INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
-%if CONFIG_HIGHBITDEPTH
 convolve_fn copy, highbd
-convolve_fn avg, highbd
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c
deleted file mode 100644
index 14352895d..000000000
--- a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h) {
-  const int bd = 8;
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
-  int intermediate_height = h + SUBPEL_TAPS - 1;
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        // Filter even-index pixels
-        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero),
-                            _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
-        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storel_epi64(p, res_8bit);
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
index e6d357ba3..b6f040791 100644
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -200,6 +200,8 @@
     movdqu      [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void aom_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -392,169 +394,6 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movq        xmm0, [rsi]                 ;load src: row 0
-    movq        xmm1, [rsi + rax]           ;1
-    movq        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2]       ;7
-    movq        xmm2, [rsi + rax]           ;2
-    movq        xmm3, [rsi + rax * 2]       ;3
-    movq        xmm4, [rsi + rdx]           ;4
-    movq        xmm5, [rsi + rax * 4]       ;5
-
-    HIGH_APPLY_FILTER_4 1
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 1, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 1, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 16
-    HIGH_APPLY_FILTER_8 1, 16
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void aom_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
@@ -772,194 +611,3 @@ sym(aom_highbd_filter_block1d16_h8_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm4,   [rsi + 2]
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm4
-    movdqa      xmm7, xmm4
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm4
-
-    psrldq      xmm1, 2
-    psrldq      xmm6, 4
-    psrldq      xmm7, 6
-    psrldq      xmm2, 4
-    psrldq      xmm3, 6
-    psrldq      xmm5, 2
-
-    HIGH_APPLY_FILTER_4 1
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 1, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 1, 0
-
-    movdqu      xmm0,   [rsi + 10]           ;load src
-    movdqu      xmm1,   [rsi + 12]
-    movdqu      xmm2,   [rsi + 14]
-    movdqu      xmm3,   [rsi + 16]
-    movdqu      xmm4,   [rsi + 18]
-    movdqu      xmm5,   [rsi + 20]
-    movdqu      xmm6,   [rsi + 22]
-    movdqu      xmm7,   [rsi + 24]
-
-    HIGH_APPLY_FILTER_8 1, 16
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index 9e2ec748c..7b3fe6419 100644
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -174,6 +174,8 @@
 %endm
 %endif
 
+SECTION .text
+
 global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_v2_sse2):
     push        rbp
@@ -254,86 +256,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2):
     ret
 %endif
 
-global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movq        xmm0, [rsi]                 ;load src
-    movq        xmm1, [rsi + 2*rax]
-
-    HIGH_APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + 2*rax]         ;1
-
-    HIGH_APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + 2*rax]       ;1
-    movdqu        xmm2, [rsi + 16]
-    movdqu        xmm3, [rsi + 2*rax + 16]
-
-    HIGH_APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
-
 global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_h2_sse2):
     push        rbp
@@ -414,84 +336,3 @@ sym(aom_highbd_filter_block1d16_h2_sse2):
     pop         rbp
     ret
 %endif
-
-global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 2
-
-    HIGH_APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 2]
-
-    HIGH_APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 2]
-    movdqu      xmm2,   [rsi + 16]
-    movdqu      xmm3,   [rsi + 18]
-
-    HIGH_APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
deleted file mode 100644
index 74ce80e50..000000000
--- a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-void aom_highbd_convolve8_add_src_hip_ssse3(
-    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
-
-  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
-  int intermediate_height = h + SUBPEL_TAPS - 1;
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i data2 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
-        const __m128i res_2 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
-        const __m128i res_4 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
-        const __m128i res_6 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
-
-        // Filter odd-index pixels
-        const __m128i res_1 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
-        const __m128i res_3 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
-        const __m128i res_5 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
-        const __m128i res_7 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1);
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
-        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
-        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storeu_si128(p, res_16bit);
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 61476b8be..af45a03ac 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -11,31 +11,12 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_ports/mem.h"
 
-// filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
 #if defined(__clang__)
 #if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
     (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
@@ -566,10 +547,4 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3;
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
 
-// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-FUN_CONV_2D(, avx2);
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index be37738df..6bcb4a512 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -11,7 +11,8 @@
 
 #include <tmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve.h"
 #include "aom_mem/aom_mem.h"
@@ -285,20 +286,6 @@ filter8_1dfunction aom_filter_block1d8_v8_ssse3;
 filter8_1dfunction aom_filter_block1d8_h8_ssse3;
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
-#if CONFIG_LOOP_RESTORATION
-filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
-#endif
 
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -306,12 +293,6 @@ filter8_1dfunction aom_filter_block1d8_v2_ssse3;
 filter8_1dfunction aom_filter_block1d8_h2_ssse3;
 filter8_1dfunction aom_filter_block1d4_v2_ssse3;
 filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
 
 // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -323,598 +304,5 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
 //                               const int16_t *filter_x, int x_step_q4,
 //                               const int16_t *filter_y, int y_step_q4,
 //                               int w, int h);
-// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
-// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
-
-#if CONFIG_LOOP_RESTORATION
-FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
-                        ssse3);
-FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
-                        src - src_stride * 3, add_src_, ssse3);
-#endif
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
-    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
-    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
-    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
-                                                                          \
-    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
-    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
-    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
-    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
-    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
-    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
-    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
-    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
-    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
-  }
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *x_filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
-  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
-  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
-  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
-  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
-  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
-  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
-  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
-  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A, B, C, D, E, F, G, H;
-
-  A = _mm_loadl_epi64((const __m128i *)src);
-  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
-  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
-  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
-  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
-  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
-  _mm_storel_epi64((__m128i *)dst, A);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
-}
-
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  // This function processes 8x8 areas.  The intermediate height is not always
-  // a multiple of 8, so force it to be a multiple of 8 here.
-  y = h + (8 - (h & 0x7));
-
-  do {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 8) {
-      // process 8 src_x steps
-      for (z = 0; z < 8; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 8; ++i) {
-            temp[z * 8 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 8x8 filters values back to dst
-      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
-    }
-
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-  } while (y -= 8);
-}
-
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  // TRANSPOSE...
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  //
-  // TO
-  //
-  // 00 10 20 30
-  // 01 11 21 31
-  // 02 12 22 32
-  // 03 13 23 33
-  // 04 14 24 34
-  // 05 15 25 35
-  // 06 16 26 36
-  // 07 17 27 37
-  //
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 02 03 12 13 22 23 32 33
-  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
-  // 06 07 16 17 26 27 36 37
-  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
-  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
-  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
-  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
-  // 00 10 01 11 02 12 03 13
-  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
-  // 20 30 21 31 22 32 23 33
-  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
-  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  B = _mm_srli_si128(A, 4);
-  C = _mm_srli_si128(A, 8);
-  D = _mm_srli_si128(A, 12);
-
-  *(int *)(dst) = _mm_cvtsi128_si32(A);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
-}
-
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  for (y = 0; y < h; y += 4) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 4) {
-      // process 4 src_x steps
-      for (z = 0; z < 4; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 4; ++i) {
-            temp[z * 4 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 4x4 filters values back to dst
-      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
-    }
-
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-  }
-}
-
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
-  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
-  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter, int w) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  int i;
-
-  for (i = 0; i < w; i += 16) {
-    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
-    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-    const __m128i C =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-    const __m128i D =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-    const __m128i E =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-    const __m128i F =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    const __m128i G =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    const __m128i H =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-    // merge the result together
-    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
-    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
-    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
-    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
-    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
-    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
-    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
-    // add and saturate the results together
-    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
-    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
-    // merge the result together
-    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
-    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
-    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
-    // merge the result together
-    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
-    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
-    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
-    // add and saturate the results together
-    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
-    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
-    // add and saturate the results together
-    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
-    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
-    // round and shift by 7 bit each 16 bit
-    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
-    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
-    src_ptr += 16;
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i *)&dst[i], temp_hi);
-  }
-}
-
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *y_filters, int y0_q4,
-                                    int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
-                            w);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  if (w >= 8) {
-    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  } else {
-    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  }
-
-  if (w >= 16) {
-    scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-  } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  } else {
-    scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
-#if CONFIG_LOOP_RESTORATION
-FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
-#endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
index b946010d3..c88fc9ffb 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -179,6 +179,8 @@
     movq        [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void aom_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -368,166 +370,6 @@ sym(aom_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movd        xmm0, [rsi]                 ;load src: row 0
-    movd        xmm1, [rsi + rax]           ;1
-    movd        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movd        xmm7, [rsi + rdx * 2]       ;7
-    movd        xmm2, [rsi + rax]           ;2
-    movd        xmm3, [rsi + rax * 2]       ;3
-    movd        xmm4, [rsi + rdx]           ;4
-    movd        xmm5, [rsi + rax * 4]       ;5
-
-    APPLY_FILTER_4 1
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 1, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 1, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 8
-    APPLY_FILTER_8 1, 8
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void aom_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
@@ -771,220 +613,3 @@ sym(aom_filter_block1d16_h8_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm3, 3
-    psrldq      xmm5, 5
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_4 1
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 0
-
-    movdqu      xmm0,   [rsi + 5]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 8
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 8688fb544..3ca7921b6 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -375,17 +375,8 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
 
 INIT_XMM ssse3
 SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER16 h8_avg
 SUBPIX_HFILTER8  h8
-SUBPIX_HFILTER8  h8_avg
 SUBPIX_HFILTER4  h8
-SUBPIX_HFILTER4  h8_avg
-
-%if CONFIG_LOOP_RESTORATION
-SUBPIX_HFILTER16 h8_add_src
-SUBPIX_HFILTER8  h8_add_src
-SUBPIX_HFILTER4  h8_add_src
-%endif
 
 ;-------------------------------------------------------------------------------
 
@@ -875,15 +866,5 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
 
 INIT_XMM ssse3
 SUBPIX_VFILTER16     v8
-SUBPIX_VFILTER16 v8_avg
 SUBPIX_VFILTER       v8, 8
-SUBPIX_VFILTER   v8_avg, 8
 SUBPIX_VFILTER       v8, 4
-SUBPIX_VFILTER   v8_avg, 4
-
-%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
-    CONFIG_LOOP_RESTORATION
-SUBPIX_VFILTER16 v8_add_src
-SUBPIX_VFILTER   v8_add_src, 8
-SUBPIX_VFILTER   v8_add_src, 4
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
index 8f025a8be..d0b4b2839 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -134,6 +134,8 @@
     dec         rcx
 %endm
 
+SECTION .text
+
 global sym(aom_filter_block1d4_v2_sse2) PRIVATE
 sym(aom_filter_block1d4_v2_sse2):
     push        rbp
@@ -212,84 +214,6 @@ sym(aom_filter_block1d16_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-    movdqa        xmm3, xmm1
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 global sym(aom_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_filter_block1d4_h2_sse2):
     push        rbp
@@ -369,83 +293,3 @@ sym(aom_filter_block1d16_h2_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
index b9b2da0be..59edc49a9 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -108,6 +108,8 @@
     dec         rcx
 %endm
 
+SECTION .text
+
 global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
 sym(aom_filter_block1d4_v2_ssse3):
     push        rbp
@@ -185,83 +187,6 @@ sym(aom_filter_block1d16_v2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d4_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d8_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d16_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
 sym(aom_filter_block1d4_h2_ssse3):
     push        rbp
@@ -340,82 +265,3 @@ sym(aom_filter_block1d16_h2_ssse3):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d4_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d8_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d16_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
deleted file mode 100644
index 1a6457402..000000000
--- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
-
-void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0 = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
-    coeff += 8;
-  }
-}
-
-int aom_satd_sse2(const int16_t *coeff, int length) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i accum = zero;
-
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
-  }
-
-  {  // cascading summation of accum
-    __m128i hi = _mm_srli_si128(accum, 8);
-    accum = _mm_add_epi32(accum, hi);
-    hi = _mm_srli_epi64(accum, 32);
-    accum = _mm_add_epi32(accum, hi);
-  }
-
-  return _mm_cvtsi128_si32(accum);
-}
-
-void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride,
-                          int height) {
-  int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-  }
-
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-
-  if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
-  }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
-
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, s1);
-  }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
-}
-
-int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) {
-  int idx;
-  int width = 4 << bwl;
-  int16_t mean;
-  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
-  __m128i v1 = _mm_load_si128((const __m128i *)src);
-  __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sum = diff;
-  __m128i sse = _mm_madd_epi16(diff, diff);
-
-  ref += 8;
-  src += 8;
-
-  for (idx = 8; idx < width; idx += 8) {
-    v0 = _mm_loadu_si128((const __m128i *)ref);
-    v1 = _mm_load_si128((const __m128i *)src);
-    diff = _mm_subs_epi16(v0, v1);
-
-    sum = _mm_add_epi16(sum, diff);
-    v0 = _mm_madd_epi16(diff, diff);
-    sse = _mm_add_epi32(sse, v0);
-
-    ref += 8;
-    src += 8;
-  }
-
-  v0 = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, v0);
-  v0 = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, v0);
-  v0 = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, v0);
-
-  v1 = _mm_srli_si128(sse, 8);
-  sse = _mm_add_epi32(sse, v1);
-  v1 = _mm_srli_epi64(sse, 32);
-  sse = _mm_add_epi32(sse, v1);
-
-  mean = _mm_extract_epi16(sum, 0);
-
-  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
deleted file mode 100644
index b2d150296..000000000
--- a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%define private_prefix aom
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the hadamard transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro HMD8_1D 0
-  psubw              m8, m0, m1
-  psubw              m9, m2, m3
-  paddw              m0, m1
-  paddw              m2, m3
-  SWAP               1, 8
-  SWAP               3, 9
-  psubw              m8, m4, m5
-  psubw              m9, m6, m7
-  paddw              m4, m5
-  paddw              m6, m7
-  SWAP               5, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m2
-  psubw              m9, m1, m3
-  paddw              m0, m2
-  paddw              m1, m3
-  SWAP               2, 8
-  SWAP               3, 9
-  psubw              m8, m4, m6
-  psubw              m9, m5, m7
-  paddw              m4, m6
-  paddw              m5, m7
-  SWAP               6, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m4
-  psubw              m9, m1, m5
-  paddw              m0, m4
-  paddw              m1, m5
-  SWAP               4, 8
-  SWAP               5, 9
-  psubw              m8, m2, m6
-  psubw              m9, m3, m7
-  paddw              m2, m6
-  paddw              m3, m7
-  SWAP               6, 8
-  SWAP               7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-  HMD8_1D
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
index e916e4ff9..4f5e3f8c1 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -11,7 +11,7 @@
 
 #include "aom/aom_integer.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 // To start out, just dispatch to the function using the 2D mask and
 // pass mask stride as 0. This can be improved upon if necessary.
@@ -19,18 +19,16 @@
 void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, 0, h, w, 0, 0);
+                            src1_stride, mask, 0, w, h, 0, 0);
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
-                                   src1_8, src1_stride, mask, 0, h, w, 0, 0,
+                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
                                    bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
index 68d74e517..49c20b467 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
@@ -31,7 +31,7 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
-                                     int h, int w) {
+                                     int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -58,7 +58,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
-                                     int h, int w) {
+                                     int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -84,7 +84,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
 static void blend_a64_mask_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -119,7 +119,7 @@ static void blend_a64_mask_w16n_sse4_1(
 static void blend_a64_mask_sx_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -149,7 +149,7 @@ static void blend_a64_mask_sx_w4_sse4_1(
 static void blend_a64_mask_sx_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -179,7 +179,7 @@ static void blend_a64_mask_sx_w8_sse4_1(
 static void blend_a64_mask_sx_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -219,7 +219,7 @@ static void blend_a64_mask_sx_w16n_sse4_1(
 static void blend_a64_mask_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -248,7 +248,7 @@ static void blend_a64_mask_sy_w4_sse4_1(
 static void blend_a64_mask_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -277,7 +277,7 @@ static void blend_a64_mask_sy_w8_sse4_1(
 static void blend_a64_mask_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zero = _mm_setzero_si128();
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -315,7 +315,7 @@ static void blend_a64_mask_sy_w16n_sse4_1(
 static void blend_a64_mask_sx_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -350,7 +350,7 @@ static void blend_a64_mask_sx_sy_w4_sse4_1(
 static void blend_a64_mask_sx_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -385,7 +385,7 @@ static void blend_a64_mask_sx_sy_w8_sse4_1(
 static void blend_a64_mask_sx_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -435,12 +435,12 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
 void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
-                               const uint8_t *mask, uint32_t mask_stride, int h,
-                               int w, int suby, int subx) {
+                               const uint8_t *mask, uint32_t mask_stride, int w,
+                               int h, int subx, int suby) {
   typedef void (*blend_fn)(
       uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
       uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int h, int w);
+      const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
   // Dimensions are: width_index X subx X suby
   static const blend_fn blend[3][2][2] = {
@@ -465,15 +465,14 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, h, w, suby, subx);
+                         mask, mask_stride, w, h, subx, suby);
   } else {
     blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
                                               src0_stride, src1, src1_stride,
-                                              mask, mask_stride, h, w);
+                                              mask, mask_stride, w, h);
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -503,7 +502,7 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1(
 static void blend_a64_mask_b10_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, blend_4_b10);
@@ -512,7 +511,7 @@ static void blend_a64_mask_b10_w4_sse4_1(
 static void blend_a64_mask_b12_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, blend_4_b12);
@@ -521,7 +520,7 @@ static void blend_a64_mask_b12_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -546,18 +545,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1(
 static void blend_a64_mask_b10_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
+                               src1_stride, mask, mask_stride, w, h,
                                blend_8_b10);
 }
 
 static void blend_a64_mask_b12_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
+                               src1_stride, mask, mask_stride, w, h,
                                blend_8_b12);
 }
 
@@ -594,7 +593,7 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
 static void blend_a64_mask_b10_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -604,7 +603,7 @@ static void blend_a64_mask_b10_sx_w4_sse4_1(
 static void blend_a64_mask_b12_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -614,7 +613,7 @@ static void blend_a64_mask_b12_sx_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
@@ -643,18 +642,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
 static void blend_a64_mask_b10_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b12);
 }
 
@@ -690,7 +689,7 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
 static void blend_a64_mask_b10_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -700,7 +699,7 @@ static void blend_a64_mask_b10_sy_w4_sse4_1(
 static void blend_a64_mask_b12_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -710,7 +709,7 @@ static void blend_a64_mask_b12_sy_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -738,18 +737,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
 static void blend_a64_mask_b10_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b12);
 }
 
@@ -791,7 +790,7 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
 static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h,
@@ -801,7 +800,7 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
 static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h,
@@ -811,7 +810,7 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
@@ -845,18 +844,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
 static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, h, w,
+                                     src1_stride, mask, mask_stride, w, h,
                                      blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, h, w,
+                                     src1_stride, mask, mask_stride, w, h,
                                      blend_8_b12);
 }
 
@@ -869,12 +868,12 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                       uint32_t src0_stride,
                                       const uint8_t *src1_8,
                                       uint32_t src1_stride, const uint8_t *mask,
-                                      uint32_t mask_stride, int h, int w,
-                                      int suby, int subx, int bd) {
+                                      uint32_t mask_stride, int w, int h,
+                                      int subx, int suby, int bd) {
   typedef void (*blend_fn)(
       uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
       uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int h, int w);
+      const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
   // Dimensions are: bd_index X width_index X subx X suby
   static const blend_fn blend[2][2][2][2] = {
@@ -909,8 +908,8 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
   assert(bd == 8 || bd == 10 || bd == 12);
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                src1_stride, mask, mask_stride, h, w, suby,
-                                subx, bd);
+                                src1_stride, mask, mask_stride, w, h, subx,
+                                suby, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -918,7 +917,113 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
 
     blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
         dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-        mask_stride, h, w);
+        mask_stride, w, h);
+  }
+}
+
+static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0,
+                                      const CONV_BUF_TYPE *src1,
+                                      const __m128i *m,
+                                      const __m128i *v_round_offset,
+                                      const __m128i *v_maxval, int round_bits) {
+  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i s0 = xx_loadl_64(src0);
+  const __m128i s1 = xx_loadl_64(src1);
+  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+  const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS);
+  const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset);
+  const __m128i res_d = xx_roundn_epi32(res_c, round_bits);
+  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+  const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+  xx_storel_32(dst, res);
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_ro_a = xx_loadl_32(&round_offset);
+  const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0);
+  const __m128i one_w = _mm_set1_epi16(1);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i two_w = _mm_set1_epi16(2);
+
+  if (subw == 0 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]);
+        const __m128i m = _mm_cvtepu8_epi16(m0);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 =
+            xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]);
+        const __m128i m_i1 =
+            xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+        const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b);
+        const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd);
+        const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+        const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
+        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]);
+        const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]);
+        const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b);
+        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
+        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 9dabe5b79..59506bdfe 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
@@ -30,7 +30,7 @@
 static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                       const uint8_t *src0, uint32_t src0_stride,
                                       const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int h, int w) {
+                                      const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -55,7 +55,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
 static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                       const uint8_t *src0, uint32_t src0_stride,
                                       const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int h, int w) {
+                                      const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -82,7 +82,7 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                         uint32_t src0_stride,
                                         const uint8_t *src1,
                                         uint32_t src1_stride,
-                                        const uint8_t *mask, int h, int w) {
+                                        const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -112,11 +112,11 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
 void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w);
+                           const uint8_t *mask, int w, int h);
 
   // Dimension: width_index
   static const blend_fn blend[9] = {
@@ -139,11 +139,10 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h,
-                 w);
+  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
+                 h);
 }
 
-#if CONFIG_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -174,7 +173,7 @@ static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
-                                          const uint8_t *mask, int h, int w) {
+                                          const uint8_t *mask, int w, int h) {
   (void)w;
   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, blend_4_b10);
@@ -185,7 +184,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
-                                          const uint8_t *mask, int h, int w) {
+                                          const uint8_t *mask, int w, int h) {
   (void)w;
   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, blend_4_b12);
@@ -194,7 +193,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
 static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -218,9 +217,9 @@ static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                            uint32_t src0_stride,
                                            const uint16_t *src1,
                                            uint32_t src1_stride,
-                                           const uint8_t *mask, int h, int w) {
+                                           const uint8_t *mask, int w, int h) {
   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, h, w, blend_8_b10);
+                                src1_stride, mask, w, h, blend_8_b10);
 }
 
 static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
@@ -228,9 +227,9 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                            uint32_t src0_stride,
                                            const uint16_t *src1,
                                            uint32_t src1_stride,
-                                           const uint8_t *mask, int h, int w) {
+                                           const uint8_t *mask, int w, int h) {
   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, h, w, blend_8_b12);
+                                src1_stride, mask, w, h, blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -240,11 +239,11 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
 void aom_highbd_blend_a64_vmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
                            const uint16_t *src0, uint32_t src0_stride,
                            const uint16_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w);
+                           const uint8_t *mask, int w, int h);
 
   // Dimensions are: bd_index X width_index
   static const blend_fn blend[2][2] = {
@@ -272,14 +271,13 @@ void aom_highbd_blend_a64_vmask_sse4_1(
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                 src1_stride, mask, h, w, bd);
+                                 src1_stride, mask, w, h, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
     const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
 
     blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, h, w);
+                                  src1_stride, mask, w, h);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
index daa2b2b3a..4880438bc 100644
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -53,7 +53,6 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
   return v_res_w;
 }
 
-#if CONFIG_HIGHBITDEPTH
 typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w);
 
@@ -141,6 +140,5 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
 
   return v_res_w;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 #endif  // AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
index 5f9596a74..3f46420dd 100644
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 // Note: in and out could have the same value
 static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
index 8641164db..36fb1963a 100644
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -13,7 +13,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #include "aom_dsp/aom_convolve.h"
@@ -84,102 +85,6 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     }                                                                        \
   }
 
-#define FUN_CONV_2D(avg, opt)                                                \
-  void aom_convolve8_##avg##opt(                                             \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                   \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                   \
-    assert(w <= MAX_SB_SIZE);                                                \
-    assert(h <= MAX_SB_SIZE);                                                \
-    assert(x_step_q4 == 16);                                                 \
-    assert(y_step_q4 == 16);                                                 \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] ||          \
-        filter_y[1] || filter_y[2]) {                                        \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
-      aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2,    \
-                                MAX_SB_SIZE, filter_x, x_step_q4, filter_y,  \
-                                y_step_q4, w, h + 7);                        \
-      aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                      dst, dst_stride, filter_x, x_step_q4,  \
-                                      filter_y, y_step_q4, w, h);            \
-    } else {                                                                 \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
-      aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE,        \
-                                filter_x, x_step_q4, filter_y, y_step_q4, w, \
-                                h + 1);                                      \
-      aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride,  \
-                                      filter_x, x_step_q4, filter_y,         \
-                                      y_step_q4, w, h);                      \
-    }                                                                        \
-  }
-
-#if CONFIG_LOOP_RESTORATION
-// convolve_add_src is only used by the Wiener filter, which will never
-// end up calling the bilinear functions (it uses a symmetric filter, so
-// the possible numbers of taps are 1,3,5,7)
-#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
-                                opt)                                        \
-  void aom_convolve8_##name##_##opt(                                        \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    (void)filter_x;                                                         \
-    (void)x_step_q4;                                                        \
-    (void)filter_y;                                                         \
-    (void)y_step_q4;                                                        \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                      \
-    assert(step_q4 == 16);                                                  \
-    while (w >= 16) {                                                       \
-      aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                               dst_stride, h, filter);      \
-      src += 16;                                                            \
-      dst += 16;                                                            \
-      w -= 16;                                                              \
-    }                                                                       \
-    while (w >= 8) {                                                        \
-      aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 8;                                                             \
-      dst += 8;                                                             \
-      w -= 8;                                                               \
-    }                                                                       \
-    while (w >= 4) {                                                        \
-      aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 4;                                                             \
-      dst += 4;                                                             \
-      w -= 4;                                                               \
-    }                                                                       \
-    if (w) {                                                                \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,  \
-                               x_step_q4, filter_y, y_step_q4, w, h);       \
-    }                                                                       \
-  }
-
-#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt)                           \
-  void aom_convolve8_##type##opt(                                           \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);  \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                  \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                  \
-    assert(w <= MAX_SB_SIZE);                                               \
-    assert(h <= MAX_SB_SIZE);                                               \
-    assert(x_step_q4 == 16);                                                \
-    assert(y_step_q4 == 16);                                                \
-    aom_convolve8_##htype##horiz_##opt(                                     \
-        src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x,    \
-        x_step_q4, filter_y, y_step_q4, w, h + 7);                          \
-    aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                     dst, dst_stride, filter_x, x_step_q4,  \
-                                     filter_y, y_step_q4, w, h);            \
-  }
-#endif
-
-#if CONFIG_HIGHBITDEPTH
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
@@ -248,41 +153,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
     }                                                                      \
   }
 
-#define HIGH_FUN_CONV_2D(avg, opt)                                            \
-  void aom_highbd_convolve8_##avg##opt(                                       \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
-    assert(w <= MAX_SB_SIZE);                                                 \
-    assert(h <= MAX_SB_SIZE);                                                 \
-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \
-      if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 ||  \
-          filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) {  \
-        DECLARE_ALIGNED(16, uint16_t,                                         \
-                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);             \
-        aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,    \
-                                         CONVERT_TO_BYTEPTR(fdata2),          \
-                                         MAX_SB_SIZE, filter_x, x_step_q4,    \
-                                         filter_y, y_step_q4, w, h + 7, bd);  \
-        aom_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst,   \
-            dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);  \
-      } else {                                                                \
-        DECLARE_ALIGNED(16, uint16_t,                                         \
-                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]);             \
-        aom_highbd_convolve8_horiz_##opt(                                     \
-            src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE,         \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd);          \
-        aom_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride,         \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);              \
-      }                                                                       \
-    } else {                                                                  \
-      aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \
-                                    filter_x, x_step_q4, filter_y, y_step_q4, \
-                                    w, h, bd);                                \
-    }                                                                         \
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
 #endif  // AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 000000000..7790baf2e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+// filters for 16
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+static INLINE void prepare_coeffs_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right-shift all filter coefficients by 1 to reduce the number of bits
+  // required. This extra right shift is compensated for at the end, when
+  // the result is rounded.
+  // Since all filter coefficients are even, this change does not affect the
+  // end result.
+  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                            _mm_set1_epi16(0xffff)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+
+  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+                                     const __m256i *const coeffs) {
+  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+                                       _mm256_add_epi16(res_23, res_67));
+
+  return res;
+}
+
+static INLINE __m256i convolve(const __m256i *const s,
+                               const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+                                       _mm256_add_epi32(res_2, res_3));
+
+  return res;
+}
+
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+                                       const __m256i *const coeffs,
+                                       const __m256i *const filt) {
+  __m256i s[4];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+  s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+  return convolve_lowbd(s, coeffs);
+}
+
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+                                         const __m256i *const res,
+                                         const int do_average) {
+  __m256i d;
+  if (do_average) {
+    d = _mm256_load_si256((__m256i *)dst);
+    d = _mm256_add_epi32(d, *res);
+    d = _mm256_srai_epi32(d, 1);
+  } else {
+    d = *res;
+  }
+  _mm256_store_si256((__m256i *)dst, d);
+}
+
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+                               const __m256i *const res_unsigned,
+                               const __m256i *const wt,
+                               const int use_jnt_comp_avg) {
+  __m256i res;
+  if (use_jnt_comp_avg) {
+    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
+    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
+
+    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+    res = _mm256_packs_epi32(res_lo, res_hi);
+  } else {
+    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+    res = _mm256_srai_epi16(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+                                        const __m256i *const offset_const,
+                                        const __m256i *const round_const,
+                                        const int round_shift) {
+  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
+  const __m256i res_round = _mm256_srai_epi16(
+      _mm256_add_epi16(res_signed, *round_const), round_shift);
+  return res_round;
+}
+
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+                                      const __m256i *const res_unsigned,
+                                      const __m256i *const wt0,
+                                      const __m256i *const wt1,
+                                      const int use_jnt_comp_avg) {
+  __m256i res;
+  if (use_jnt_comp_avg) {
+    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
+    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
+    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
+    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
+  } else {
+    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+    res = _mm256_srai_epi32(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m256i highbd_convolve_rounding(
+    const __m256i *const res_unsigned, const __m256i *const offset_const,
+    const __m256i *const round_const, const int round_shift) {
+  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
+  const __m256i res_round = _mm256_srai_epi32(
+      _mm256_add_epi32(res_signed, *round_const), round_shift);
+
+  return res_round;
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 000000000..e80c5872f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+//  This header file must be included after any x86 intrinsics header file
+
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+                             const int do_average) {
+  __m128i d;
+  if (do_average) {
+    d = _mm_load_si128((__m128i *)dst);
+    d = _mm_add_epi32(d, *res);
+    d = _mm_srai_epi32(d, 1);
+  } else {
+    d = *res;
+  }
+  _mm_store_si128((__m128i *)dst, d);
+}
+
+#endif  // _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 000000000..846fe7bb4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+// Note:
+//  This header file must be included after any x86 intrinsics header file
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+  const __m128i res =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
+
+  return res;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+                               const __m128i *const res_unsigned,
+                               const __m128i *const wt,
+                               const int use_jnt_comp_avg) {
+  __m128i res;
+  if (use_jnt_comp_avg) {
+    const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+    const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+    const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
+    const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
+
+    const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+    const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+    res = _mm_packs_epi32(res_lo, res_hi);
+  } else {
+    const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
+    res = _mm_srai_epi16(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+                                        const __m128i *const offset_const,
+                                        const __m128i *const round_const,
+                                        const int round_shift) {
+  const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
+  const __m128i res_round =
+      _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
+  return res_round;
+}
+
+static INLINE __m128i highbd_convolve_rounding_sse2(
+    const __m128i *const res_unsigned, const __m128i *const offset_const,
+    const __m128i *const round_const, const int round_shift) {
+  const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
+  const __m128i res_round =
+      _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
+
+  return res_round;
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 000000000..d48c25667
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+
+// Note:
+//  This header file should be included after any x86 intrinsics header file
+
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+                                  const __m128i *const res,
+                                  const __m128i *const wt0,
+                                  const __m128i *const wt1,
+                                  const int do_average) {
+  // Store *res to dst, first blending it with dst's old value if do_average.
+  __m128i *const out = (__m128i *)dst;
+  __m128i value = *res;
+  if (do_average) {
+    const __m128i old_wt = _mm_mullo_epi32(_mm_load_si128(out), *wt0);
+    const __m128i new_wt = _mm_mullo_epi32(*res, *wt1);
+    value = _mm_srai_epi32(_mm_add_epi32(old_wt, new_wt), DIST_PRECISION_BITS);
+  }
+  _mm_store_si128(out, value);
+}
+
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+                                             const __m128i *const res_unsigned,
+                                             const __m128i *const wt0,
+                                             const __m128i *const wt1,
+                                             const int use_jnt_comp_avg) {
+  // Per 32-bit lane average of a = *data_ref_0 and b = *res_unsigned.
+  if (!use_jnt_comp_avg) {
+    // Unweighted: (a + b) >> 1.
+    return _mm_srai_epi32(_mm_add_epi32(*data_ref_0, *res_unsigned), 1);
+  }
+
+  // Weighted: (a * wt0 + b * wt1) >> DIST_PRECISION_BITS, keeping the low 32
+  // bits of each product.
+  const __m128i scaled_ref = _mm_mullo_epi32(*data_ref_0, *wt0);
+  const __m128i scaled_res = _mm_mullo_epi32(*res_unsigned, *wt1);
+  const __m128i blended = _mm_add_epi32(scaled_ref, scaled_res);
+  return _mm_srai_epi32(blended, DIST_PRECISION_BITS);
+}
+
+#endif  // _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 000000000..54da02253
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+                                          int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+          _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+          _mm256_mul_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+            _mm256_mul_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+            _mm256_mul_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+                  aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+                  aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+                  aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 000000000..12bdc3e18
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+                                const int ldb) {
+  // Transpose one 4x4 tile; A rows are lda floats apart, B rows ldb apart.
+  __m128 tile[4];
+  for (int i = 0; i < 4; ++i) {
+    tile[i] = _mm_load_ps(A + i * lda);
+  }
+  _MM_TRANSPOSE4_PS(tile[0], tile[1], tile[2], tile[3]);
+  for (int i = 0; i < 4; ++i) {
+    _mm_store_ps(B + i * ldb, tile[i]);
+  }
+}
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+  // Walk the n x n matrix in 4x4 tiles; tile (row, col) lands at (col, row).
+  for (int row = 0; row < n; row += 4) {
+    for (int col = 0; col < n; col += 4)
+      transpose4x4(&A[row * n + col], &B[col * n + row], n, n);
+  }
+}
+
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+  const int n2 = n / 2;  // midpoint row/column index
+  output[0] = packed[0];  // entry (0, 0): real slot
+  output[1] = 0;  // imaginary slot; output holds (re, im) pairs
+  output[2 * (n2 * n)] = packed[n2 * n];  // entry (n2, 0)
+  output[2 * (n2 * n) + 1] = 0;
+
+  output[2 * n2] = packed[n2];  // entry (0, n2)
+  output[2 * n2 + 1] = 0;
+  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];  // entry (n2, n2)
+  output[2 * (n2 * n + n2) + 1] = 0;
+
+  for (int c = 1; c < n2; ++c) {  // rows 0 and n2, columns 1..n2-1
+    output[2 * (0 * n + c)] = packed[c];
+    output[2 * (0 * n + c) + 1] = packed[c + n2];
+    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+  }
+  for (int r = 1; r < n2; ++r) {  // row r, then row r + n2 further down
+    output[2 * (r * n + 0)] = packed[r * n];
+    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+    for (int c = 1; c < AOMMIN(n2, 4); ++c) {  // scalar head, c < 4
+      output[2 * (r * n + c)] =
+          packed[r * n + c] - packed[(r + n2) * n + c + n2];
+      output[2 * (r * n + c) + 1] =
+          packed[(r + n2) * n + c] + packed[r * n + c + n2];
+    }
+
+    for (int c = 4; c < n2; c += 4) {  // 4 columns per step, aligned access
+      __m128 real1 = _mm_load_ps(packed + r * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+      real1 = _mm_sub_ps(real1, real2);  // same formula as the scalar head
+      imag1 = _mm_add_ps(imag1, imag2);
+      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+    }
+
+    int r2 = r + n2;  // output row being written next
+    int r3 = n - r2;  // packed row it is built from (equals n2 - r)
+    output[2 * (r2 * n + 0)] = packed[r3 * n];
+    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+    for (int c = 1; c < AOMMIN(4, n2); ++c) {  // scalar head, c < 4
+      output[2 * (r2 * n + c)] =
+          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+      output[2 * (r2 * n + c) + 1] =
+          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+    }
+    for (int c = 4; c < n2; c += 4) {  // 4 columns per step, aligned access
+      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+      real1 = _mm_add_ps(real1, real2);  // same formula as the scalar head
+      imag1 = _mm_sub_ps(imag2, imag1);
+      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r2 * n + c + 2),
+                   _mm_unpackhi_ps(real1, imag1));
+    }
+  }
+}
+
+// Generate definitions for 1d transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+                  aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+                  aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
deleted file mode 100644
index b8ec08de7..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
+++ /dev/null
@@ -1,862 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// Apply a 32-element IDCT to 8 columns. This does not do any transposition
-// of its output - the caller is expected to do that.
-// The input buffers are the top and bottom halves of an 8x32 block.
-void fdct32_8col(__m128i *in0, __m128i *in1) {
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i step1[32];
-  __m128i step2[32];
-  __m128i step3[32];
-  __m128i out[32];
-  // Stage 1
-  {
-    const __m128i *ina = in0;
-    const __m128i *inb = in1 + 15;
-    __m128i *step1a = &step1[0];
-    __m128i *step1b = &step1[31];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 4;
-    const __m128i *inb = in1 + 11;
-    __m128i *step1a = &step1[4];
-    __m128i *step1b = &step1[27];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 8;
-    const __m128i *inb = in1 + 7;
-    __m128i *step1a = &step1[8];
-    __m128i *step1b = &step1[23];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 12;
-    const __m128i *inb = in1 + 3;
-    __m128i *step1a = &step1[12];
-    __m128i *step1b = &step1[19];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  // Stage 2
-  {
-    step2[0] = _mm_add_epi16(step1[0], step1[15]);
-    step2[1] = _mm_add_epi16(step1[1], step1[14]);
-    step2[2] = _mm_add_epi16(step1[2], step1[13]);
-    step2[3] = _mm_add_epi16(step1[3], step1[12]);
-    step2[4] = _mm_add_epi16(step1[4], step1[11]);
-    step2[5] = _mm_add_epi16(step1[5], step1[10]);
-    step2[6] = _mm_add_epi16(step1[6], step1[9]);
-    step2[7] = _mm_add_epi16(step1[7], step1[8]);
-    step2[8] = _mm_sub_epi16(step1[7], step1[8]);
-    step2[9] = _mm_sub_epi16(step1[6], step1[9]);
-    step2[10] = _mm_sub_epi16(step1[5], step1[10]);
-    step2[11] = _mm_sub_epi16(step1[4], step1[11]);
-    step2[12] = _mm_sub_epi16(step1[3], step1[12]);
-    step2[13] = _mm_sub_epi16(step1[2], step1[13]);
-    step2[14] = _mm_sub_epi16(step1[1], step1[14]);
-    step2[15] = _mm_sub_epi16(step1[0], step1[15]);
-  }
-  {
-    const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-    const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-    const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-    const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-    const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-    const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-    const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-    const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-    const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-    const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-    const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-    const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-    const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-    const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-    const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-    const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-    const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-    const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-    const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-    const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-    const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-    const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-    const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-    const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-    const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-    const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-    const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-    const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-    const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-    const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-    const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-    const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-    const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-    const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-    const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-    const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-    const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-    const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-    const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-    // Combine
-    step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-    step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-    step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-    step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-    step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-    step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-    step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-    step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-  }
-  // Stage 3
-  {
-    step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
-    step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
-    step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
-    step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
-    step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
-    step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
-    step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
-    step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
-  }
-  {
-    const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-    const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-    const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-    const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-    const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-    const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-    const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-    const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-    const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-    const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-    const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-    const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-    const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-    const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-    const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-    const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-    const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-    const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-    const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-    // Combine
-    step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-    step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-    step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-    step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-  }
-  {
-    step3[16] = _mm_add_epi16(step2[23], step1[16]);
-    step3[17] = _mm_add_epi16(step2[22], step1[17]);
-    step3[18] = _mm_add_epi16(step2[21], step1[18]);
-    step3[19] = _mm_add_epi16(step2[20], step1[19]);
-    step3[20] = _mm_sub_epi16(step1[19], step2[20]);
-    step3[21] = _mm_sub_epi16(step1[18], step2[21]);
-    step3[22] = _mm_sub_epi16(step1[17], step2[22]);
-    step3[23] = _mm_sub_epi16(step1[16], step2[23]);
-    step3[24] = _mm_sub_epi16(step1[31], step2[24]);
-    step3[25] = _mm_sub_epi16(step1[30], step2[25]);
-    step3[26] = _mm_sub_epi16(step1[29], step2[26]);
-    step3[27] = _mm_sub_epi16(step1[28], step2[27]);
-    step3[28] = _mm_add_epi16(step2[27], step1[28]);
-    step3[29] = _mm_add_epi16(step2[26], step1[29]);
-    step3[30] = _mm_add_epi16(step2[25], step1[30]);
-    step3[31] = _mm_add_epi16(step2[24], step1[31]);
-  }
-
-  // Stage 4
-  {
-    step1[0] = _mm_add_epi16(step3[3], step3[0]);
-    step1[1] = _mm_add_epi16(step3[2], step3[1]);
-    step1[2] = _mm_sub_epi16(step3[1], step3[2]);
-    step1[3] = _mm_sub_epi16(step3[0], step3[3]);
-    step1[8] = _mm_add_epi16(step3[11], step2[8]);
-    step1[9] = _mm_add_epi16(step3[10], step2[9]);
-    step1[10] = _mm_sub_epi16(step2[9], step3[10]);
-    step1[11] = _mm_sub_epi16(step2[8], step3[11]);
-    step1[12] = _mm_sub_epi16(step2[15], step3[12]);
-    step1[13] = _mm_sub_epi16(step2[14], step3[13]);
-    step1[14] = _mm_add_epi16(step3[13], step2[14]);
-    step1[15] = _mm_add_epi16(step3[12], step2[15]);
-  }
-  {
-    const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-    const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-    const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-    const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-    const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-    const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-    const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-    const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-    const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-    // Combine
-    step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-    step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-  }
-  {
-    const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-    const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-    const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-    const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-    const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-    const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-    const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-    const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-    const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-    const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-    const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-    const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-    const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-    const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-    const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-    const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-    const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-    const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-    const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-    const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-    const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-    const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-    const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-    const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-    // dct_const_round_shift
-    const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-    const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-    const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-    const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-    const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-    const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-    const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-    const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-    const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-    const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-    const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-    const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-    const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-    const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-    const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-    const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-    // Combine
-    step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-    step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-    step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-    step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-    step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-    step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-    step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-    step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-  }
-  // Stage 5
-  {
-    step2[4] = _mm_add_epi16(step1[5], step3[4]);
-    step2[5] = _mm_sub_epi16(step3[4], step1[5]);
-    step2[6] = _mm_sub_epi16(step3[7], step1[6]);
-    step2[7] = _mm_add_epi16(step1[6], step3[7]);
-  }
-  {
-    const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-    const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-    const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-    const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-    const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-    const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-    const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-    const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-    const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-    const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-    const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-    const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-    // dct_const_round_shift
-    const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-    const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-    const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-    const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-    const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-    const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-    const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-    const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-    // Combine
-    out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-    out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-    out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-    out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-  }
-  {
-    const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-    const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-    const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-    const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-    const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-    const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-    const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-    const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-    const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-    const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-    const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-    const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-    // dct_const_round_shift
-    const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-    const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-    const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-    const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-    const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-    const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-    const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-    const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-    // Combine
-    step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-    step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-    step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-    step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-  }
-  {
-    step2[16] = _mm_add_epi16(step1[19], step3[16]);
-    step2[17] = _mm_add_epi16(step1[18], step3[17]);
-    step2[18] = _mm_sub_epi16(step3[17], step1[18]);
-    step2[19] = _mm_sub_epi16(step3[16], step1[19]);
-    step2[20] = _mm_sub_epi16(step3[23], step1[20]);
-    step2[21] = _mm_sub_epi16(step3[22], step1[21]);
-    step2[22] = _mm_add_epi16(step1[21], step3[22]);
-    step2[23] = _mm_add_epi16(step1[20], step3[23]);
-    step2[24] = _mm_add_epi16(step1[27], step3[24]);
-    step2[25] = _mm_add_epi16(step1[26], step3[25]);
-    step2[26] = _mm_sub_epi16(step3[25], step1[26]);
-    step2[27] = _mm_sub_epi16(step3[24], step1[27]);
-    step2[28] = _mm_sub_epi16(step3[31], step1[28]);
-    step2[29] = _mm_sub_epi16(step3[30], step1[29]);
-    step2[30] = _mm_add_epi16(step1[29], step3[30]);
-    step2[31] = _mm_add_epi16(step1[28], step3[31]);
-  }
-  // Stage 6
-  {
-    const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-    const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-    const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-    const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-    const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-    const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-    const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-    const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-    const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-    const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-    const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-    const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-    const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-    const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-    const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-    const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-    // dct_const_round_shift
-    const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-    const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-    const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-    const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-    const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-    const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-    const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-    const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-    // Combine
-    out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-    out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-    out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-    out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-  }
-  {
-    step3[8] = _mm_add_epi16(step2[9], step1[8]);
-    step3[9] = _mm_sub_epi16(step1[8], step2[9]);
-    step3[10] = _mm_sub_epi16(step1[11], step2[10]);
-    step3[11] = _mm_add_epi16(step2[10], step1[11]);
-    step3[12] = _mm_add_epi16(step2[13], step1[12]);
-    step3[13] = _mm_sub_epi16(step1[12], step2[13]);
-    step3[14] = _mm_sub_epi16(step1[15], step2[14]);
-    step3[15] = _mm_add_epi16(step2[14], step1[15]);
-  }
-  {
-    const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-    const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-    const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-    const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-    const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-    const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-    const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-    const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-    const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-    const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-    const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-    const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-    const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-    const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-    const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-    const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-    const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-    const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-    const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-    const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-    const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-    const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-    const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-    const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-    // dct_const_round_shift
-    const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-    const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-    const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-    const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-    const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-    const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-    const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-    const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-    const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-    const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-    const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-    const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-    const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-    const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-    const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-    const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-    // Combine
-    step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-    step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-    step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-    step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-    // Combine
-    step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-    step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-    step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-    step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-  }
-  // Stage 7
-  {
-    const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-    const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-    const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-    const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-    const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-    const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-    const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-    const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-    const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-    const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-    const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-    const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-    const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-    const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-    const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-    const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-    const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-    const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-    const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-    const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-    const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-    const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-    const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-    const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-    // dct_const_round_shift
-    const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-    const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-    const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-    const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-    const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-    const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-    const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-    const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-    const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-    const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-    const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-    const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-    const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-    const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-    const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-    const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-    // Combine
-    out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-    out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-    out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-    out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-    out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-    out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-    out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-    out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-  }
-  {
-    step1[16] = _mm_add_epi16(step3[17], step2[16]);
-    step1[17] = _mm_sub_epi16(step2[16], step3[17]);
-    step1[18] = _mm_sub_epi16(step2[19], step3[18]);
-    step1[19] = _mm_add_epi16(step3[18], step2[19]);
-    step1[20] = _mm_add_epi16(step3[21], step2[20]);
-    step1[21] = _mm_sub_epi16(step2[20], step3[21]);
-    step1[22] = _mm_sub_epi16(step2[23], step3[22]);
-    step1[23] = _mm_add_epi16(step3[22], step2[23]);
-    step1[24] = _mm_add_epi16(step3[25], step2[24]);
-    step1[25] = _mm_sub_epi16(step2[24], step3[25]);
-    step1[26] = _mm_sub_epi16(step2[27], step3[26]);
-    step1[27] = _mm_add_epi16(step3[26], step2[27]);
-    step1[28] = _mm_add_epi16(step3[29], step2[28]);
-    step1[29] = _mm_sub_epi16(step2[28], step3[29]);
-    step1[30] = _mm_sub_epi16(step2[31], step3[30]);
-    step1[31] = _mm_add_epi16(step3[30], step2[31]);
-  }
-  // Final stage --- outputs indices are bit-reversed.
-  {
-    const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-    const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-    const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-    const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-    const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-    const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-    const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-    const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-    const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-    const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-    const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-    const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-    const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-    const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-    const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-    const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-    const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-    const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-    const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-    const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-    const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-    const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-    const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-    const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-    // dct_const_round_shift
-    const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-    const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-    const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-    const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-    const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-    const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-    const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-    const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-    const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-    const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-    const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-    const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-    const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-    const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-    const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-    const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-    // Combine
-    out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-    out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-    out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-    out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-    out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-    out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-    out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-    out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-  }
-  {
-    const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-    const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-    const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-    const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-    const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-    const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-    const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-    const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-    const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-    const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-    const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-    const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-    const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-    const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-    const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-    const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-    const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-    const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-    const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-    const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-    const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-    const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-    const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-    const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-    // dct_const_round_shift
-    const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-    const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-    const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-    const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-    const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-    const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-    const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-    const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-    const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-    const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-    const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-    const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-    const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-    const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-    const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-    const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-    // Combine
-    out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-    out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-    out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-    out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-    out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-    out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-    out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-    out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-  }
-
-  // Output results
-  {
-    int j;
-    for (j = 0; j < 16; ++j) {
-      _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
-      _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
-    }
-  }
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
deleted file mode 100644
index 216739581..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ /dev/null
@@ -1,3022 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_intrin.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-#if FDCT32x32_HIGH_PRECISION
-static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
-  __m256i buf0, buf1;
-  buf0 = _mm256_mul_epu32(a, b);
-  a = _mm256_srli_epi64(a, 32);
-  b = _mm256_srli_epi64(b, 32);
-  buf1 = _mm256_mul_epu32(a, b);
-  return _mm256_add_epi64(buf0, buf1);
-}
-
-static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
-  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm256_unpacklo_epi64(buf0, buf1);
-}
-#endif
-
-#ifndef STORE_COEFF_FUNC
-#define STORE_COEFF_FUNC
-static void store_coeff(const __m256i *coeff, tran_low_t *curr,
-                        tran_low_t *next) {
-  __m128i u = _mm256_castsi256_si128(*coeff);
-  storeu_output(&u, curr);
-  u = _mm256_extractf128_si256(*coeff, 1);
-  storeu_output(&u, next);
-}
-#endif
-
-void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
-                       int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i k__cospi_p16_m16 =
-      pair256_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i k__cospi_m12_m20 =
-      pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  const __m256i kZero = _mm256_set1_epi16(0);
-  const __m256i kOne = _mm256_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-  for (pass = 0; pass < 2; ++pass) {
-    // We process sixteen columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 16) {
-      __m256i step1[32];
-      __m256i step2[32];
-      __m256i step3[32];
-      __m256i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic make the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m256i *step1a = &step1[0];
-          __m256i *step1b = &step1[31];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m256i *step1a = &step1[4];
-          __m256i *step1b = &step1[27];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m256i *step1a = &step1[8];
-          __m256i *step1b = &step1[23];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m256i *step1a = &step1[12];
-          __m256i *step1b = &step1[19];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
-          __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
-          __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
-          __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
-          __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
-          __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
-          __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
-          __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
-          step1[0] = _mm256_add_epi16(in00, in31);
-          step1[1] = _mm256_add_epi16(in01, in30);
-          step1[2] = _mm256_add_epi16(in02, in29);
-          step1[3] = _mm256_add_epi16(in03, in28);
-          step1[28] = _mm256_sub_epi16(in03, in28);
-          step1[29] = _mm256_sub_epi16(in02, in29);
-          step1[30] = _mm256_sub_epi16(in01, in30);
-          step1[31] = _mm256_sub_epi16(in00, in31);
-        }
-        {
-          __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
-          __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
-          __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
-          __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
-          __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
-          __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
-          __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
-          __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
-          step1[4] = _mm256_add_epi16(in04, in27);
-          step1[5] = _mm256_add_epi16(in05, in26);
-          step1[6] = _mm256_add_epi16(in06, in25);
-          step1[7] = _mm256_add_epi16(in07, in24);
-          step1[24] = _mm256_sub_epi16(in07, in24);
-          step1[25] = _mm256_sub_epi16(in06, in25);
-          step1[26] = _mm256_sub_epi16(in05, in26);
-          step1[27] = _mm256_sub_epi16(in04, in27);
-        }
-        {
-          __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
-          __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
-          __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
-          __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
-          __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
-          __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
-          __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
-          __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
-          step1[8] = _mm256_add_epi16(in08, in23);
-          step1[9] = _mm256_add_epi16(in09, in22);
-          step1[10] = _mm256_add_epi16(in10, in21);
-          step1[11] = _mm256_add_epi16(in11, in20);
-          step1[20] = _mm256_sub_epi16(in11, in20);
-          step1[21] = _mm256_sub_epi16(in10, in21);
-          step1[22] = _mm256_sub_epi16(in09, in22);
-          step1[23] = _mm256_sub_epi16(in08, in23);
-        }
-        {
-          __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
-          __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
-          __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
-          __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
-          __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
-          __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
-          __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
-          __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
-          step1[12] = _mm256_add_epi16(in12, in19);
-          step1[13] = _mm256_add_epi16(in13, in18);
-          step1[14] = _mm256_add_epi16(in14, in17);
-          step1[15] = _mm256_add_epi16(in15, in16);
-          step1[16] = _mm256_sub_epi16(in15, in16);
-          step1[17] = _mm256_sub_epi16(in14, in17);
-          step1[18] = _mm256_sub_epi16(in13, in18);
-          step1[19] = _mm256_sub_epi16(in12, in19);
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = _mm256_add_epi16(step1[0], step1[15]);
-        step2[1] = _mm256_add_epi16(step1[1], step1[14]);
-        step2[2] = _mm256_add_epi16(step1[2], step1[13]);
-        step2[3] = _mm256_add_epi16(step1[3], step1[12]);
-        step2[4] = _mm256_add_epi16(step1[4], step1[11]);
-        step2[5] = _mm256_add_epi16(step1[5], step1[10]);
-        step2[6] = _mm256_add_epi16(step1[6], step1[9]);
-        step2[7] = _mm256_add_epi16(step1[7], step1[8]);
-        step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
-        step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
-        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
-        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
-        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
-        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
-        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
-        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
-      }
-      {
-        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
-        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
-        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
-        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
-        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
-        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
-        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
-        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
-        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
-        const __m256i s2_20_4 =
-            _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_20_5 =
-            _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_21_4 =
-            _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_21_5 =
-            _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_22_4 =
-            _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_22_5 =
-            _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_23_4 =
-            _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_23_5 =
-            _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_24_4 =
-            _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_24_5 =
-            _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_25_4 =
-            _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_25_5 =
-            _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_26_4 =
-            _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_26_5 =
-            _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_27_4 =
-            _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_27_5 =
-            _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
-        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
-        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
-        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
-        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
-        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
-        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
-        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
-        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
-        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
-        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
-        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
-        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
-        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
-        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
-        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
-        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
-        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
-        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
-        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
-        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
-        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
-        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
-        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
-        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
-        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
-        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
-        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
-        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
-        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
-        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
-        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
-
-        step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
-        step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
-        step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
-        step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
-        step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
-        step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
-        step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
-        step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
-        step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
-        step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
-        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
-        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
-        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
-        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
-        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
-        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
-        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
-        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
-        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
-        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
-        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
-        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
-        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
-        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
-        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
-        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
-        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
-        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
-        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
-        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
-        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
-        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
-
-        step2[0] = _mm256_add_epi16(step2[0], kOne);
-        step2[1] = _mm256_add_epi16(step2[1], kOne);
-        step2[2] = _mm256_add_epi16(step2[2], kOne);
-        step2[3] = _mm256_add_epi16(step2[3], kOne);
-        step2[4] = _mm256_add_epi16(step2[4], kOne);
-        step2[5] = _mm256_add_epi16(step2[5], kOne);
-        step2[6] = _mm256_add_epi16(step2[6], kOne);
-        step2[7] = _mm256_add_epi16(step2[7], kOne);
-        step2[8] = _mm256_add_epi16(step2[8], kOne);
-        step2[9] = _mm256_add_epi16(step2[9], kOne);
-        step2[10] = _mm256_add_epi16(step2[10], kOne);
-        step2[11] = _mm256_add_epi16(step2[11], kOne);
-        step2[12] = _mm256_add_epi16(step2[12], kOne);
-        step2[13] = _mm256_add_epi16(step2[13], kOne);
-        step2[14] = _mm256_add_epi16(step2[14], kOne);
-        step2[15] = _mm256_add_epi16(step2[15], kOne);
-        step1[16] = _mm256_add_epi16(step1[16], kOne);
-        step1[17] = _mm256_add_epi16(step1[17], kOne);
-        step1[18] = _mm256_add_epi16(step1[18], kOne);
-        step1[19] = _mm256_add_epi16(step1[19], kOne);
-        step2[20] = _mm256_add_epi16(step2[20], kOne);
-        step2[21] = _mm256_add_epi16(step2[21], kOne);
-        step2[22] = _mm256_add_epi16(step2[22], kOne);
-        step2[23] = _mm256_add_epi16(step2[23], kOne);
-        step2[24] = _mm256_add_epi16(step2[24], kOne);
-        step2[25] = _mm256_add_epi16(step2[25], kOne);
-        step2[26] = _mm256_add_epi16(step2[26], kOne);
-        step2[27] = _mm256_add_epi16(step2[27], kOne);
-        step1[28] = _mm256_add_epi16(step1[28], kOne);
-        step1[29] = _mm256_add_epi16(step1[29], kOne);
-        step1[30] = _mm256_add_epi16(step1[30], kOne);
-        step1[31] = _mm256_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm256_srai_epi16(step2[0], 2);
-        step2[1] = _mm256_srai_epi16(step2[1], 2);
-        step2[2] = _mm256_srai_epi16(step2[2], 2);
-        step2[3] = _mm256_srai_epi16(step2[3], 2);
-        step2[4] = _mm256_srai_epi16(step2[4], 2);
-        step2[5] = _mm256_srai_epi16(step2[5], 2);
-        step2[6] = _mm256_srai_epi16(step2[6], 2);
-        step2[7] = _mm256_srai_epi16(step2[7], 2);
-        step2[8] = _mm256_srai_epi16(step2[8], 2);
-        step2[9] = _mm256_srai_epi16(step2[9], 2);
-        step2[10] = _mm256_srai_epi16(step2[10], 2);
-        step2[11] = _mm256_srai_epi16(step2[11], 2);
-        step2[12] = _mm256_srai_epi16(step2[12], 2);
-        step2[13] = _mm256_srai_epi16(step2[13], 2);
-        step2[14] = _mm256_srai_epi16(step2[14], 2);
-        step2[15] = _mm256_srai_epi16(step2[15], 2);
-        step1[16] = _mm256_srai_epi16(step1[16], 2);
-        step1[17] = _mm256_srai_epi16(step1[17], 2);
-        step1[18] = _mm256_srai_epi16(step1[18], 2);
-        step1[19] = _mm256_srai_epi16(step1[19], 2);
-        step2[20] = _mm256_srai_epi16(step2[20], 2);
-        step2[21] = _mm256_srai_epi16(step2[21], 2);
-        step2[22] = _mm256_srai_epi16(step2[22], 2);
-        step2[23] = _mm256_srai_epi16(step2[23], 2);
-        step2[24] = _mm256_srai_epi16(step2[24], 2);
-        step2[25] = _mm256_srai_epi16(step2[25], 2);
-        step2[26] = _mm256_srai_epi16(step2[26], 2);
-        step2[27] = _mm256_srai_epi16(step2[27], 2);
-        step1[28] = _mm256_srai_epi16(step1[28], 2);
-        step1[29] = _mm256_srai_epi16(step1[29], 2);
-        step1[30] = _mm256_srai_epi16(step1[30], 2);
-        step1[31] = _mm256_srai_epi16(step1[31], 2);
-      }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
-          step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
-          step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
-          step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
-          step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
-          step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
-          step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
-          step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
-          step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
-        }
-        {
-          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
-          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
-          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
-          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
-          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s3_10_4 =
-              _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_5 =
-              _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_4 =
-              _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_5 =
-              _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_4 =
-              _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_5 =
-              _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_4 =
-              _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_5 =
-              _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
-        }
-        {
-          step3[16] = _mm256_add_epi16(step2[23], step1[16]);
-          step3[17] = _mm256_add_epi16(step2[22], step1[17]);
-          step3[18] = _mm256_add_epi16(step2[21], step1[18]);
-          step3[19] = _mm256_add_epi16(step2[20], step1[19]);
-          step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
-          step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
-          step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
-          step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
-          step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
-          step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
-          step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
-          step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
-          step3[28] = _mm256_add_epi16(step2[27], step1[28]);
-          step3[29] = _mm256_add_epi16(step2[26], step1[29]);
-          step3[30] = _mm256_add_epi16(step2[25], step1[30]);
-          step3[31] = _mm256_add_epi16(step2[24], step1[31]);
-        }
-
-        // Stage 4
-        {
-          step1[0] = _mm256_add_epi16(step3[3], step3[0]);
-          step1[1] = _mm256_add_epi16(step3[2], step3[1]);
-          step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
-          step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
-          step1[8] = _mm256_add_epi16(step3[11], step2[8]);
-          step1[9] = _mm256_add_epi16(step3[10], step2[9]);
-          step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
-          step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
-          step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
-          step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
-          step1[14] = _mm256_add_epi16(step3[13], step2[14]);
-          step1[15] = _mm256_add_epi16(step3[12], step2[15]);
-        }
-        {
-          const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
-          const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
-          const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s1_05_4 =
-              _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_05_5 =
-              _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_06_4 =
-              _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_06_5 =
-              _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
-        }
-        {
-          const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
-          const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
-          const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
-          const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
-          const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
-          const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
-          const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
-          const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
-          const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m256i s1_18_4 =
-              _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_18_5 =
-              _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_19_4 =
-              _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_19_5 =
-              _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_20_4 =
-              _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_20_5 =
-              _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_21_4 =
-              _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_21_5 =
-              _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_26_4 =
-              _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_26_5 =
-              _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_27_4 =
-              _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_27_5 =
-              _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_28_4 =
-              _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_28_5 =
-              _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_29_4 =
-              _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_29_5 =
-              _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
-        }
-        // Stage 5
-        {
-          step2[4] = _mm256_add_epi16(step1[5], step3[4]);
-          step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
-          step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
-          step2[7] = _mm256_add_epi16(step1[6], step3[7]);
-        }
-        {
-          const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
-          const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
-          const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
-          const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
-          const __m256i out_00_2 =
-              _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m256i out_00_3 =
-              _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m256i out_16_2 =
-              _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m256i out_16_3 =
-              _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m256i out_08_2 =
-              _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m256i out_08_3 =
-              _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m256i out_24_2 =
-              _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m256i out_24_3 =
-              _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m256i out_00_4 =
-              _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_00_5 =
-              _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_16_4 =
-              _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_16_5 =
-              _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_08_4 =
-              _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_08_5 =
-              _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_24_4 =
-              _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_24_5 =
-              _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
-        }
-        {
-          const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
-          const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
-          const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
-          const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
-          const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m256i s2_09_4 =
-              _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_09_5 =
-              _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_10_4 =
-              _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_10_5 =
-              _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_13_4 =
-              _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_13_5 =
-              _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_14_4 =
-              _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_14_5 =
-              _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
-        }
-        {
-          step2[16] = _mm256_add_epi16(step1[19], step3[16]);
-          step2[17] = _mm256_add_epi16(step1[18], step3[17]);
-          step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
-          step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
-          step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
-          step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
-          step2[22] = _mm256_add_epi16(step1[21], step3[22]);
-          step2[23] = _mm256_add_epi16(step1[20], step3[23]);
-          step2[24] = _mm256_add_epi16(step1[27], step3[24]);
-          step2[25] = _mm256_add_epi16(step1[26], step3[25]);
-          step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
-          step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
-          step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
-          step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
-          step2[30] = _mm256_add_epi16(step1[29], step3[30]);
-          step2[31] = _mm256_add_epi16(step1[28], step3[31]);
-        }
-        // Stage 6
-        {
-          const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
-          const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
-          const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
-          const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
-          const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
-          const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
-          const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
-          const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
-          const __m256i out_04_2 =
-              _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m256i out_04_3 =
-              _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m256i out_20_2 =
-              _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m256i out_20_3 =
-              _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m256i out_12_2 =
-              _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m256i out_12_3 =
-              _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m256i out_28_2 =
-              _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m256i out_28_3 =
-              _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m256i out_04_4 =
-              _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_04_5 =
-              _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_20_4 =
-              _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_20_5 =
-              _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_12_4 =
-              _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_12_5 =
-              _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_28_4 =
-              _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_28_5 =
-              _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
-        }
-        {
-          step3[8] = _mm256_add_epi16(step2[9], step1[8]);
-          step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
-          step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
-          step3[11] = _mm256_add_epi16(step2[10], step1[11]);
-          step3[12] = _mm256_add_epi16(step2[13], step1[12]);
-          step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
-          step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
-          step3[15] = _mm256_add_epi16(step2[14], step1[15]);
-        }
-        {
-          const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
-          const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
-          const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
-          const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
-          const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
-          const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
-          const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
-          const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
-          const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m256i s3_17_4 =
-              _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_17_5 =
-              _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_18_4 =
-              _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_18_5 =
-              _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_21_4 =
-              _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_21_5 =
-              _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_22_4 =
-              _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_22_5 =
-              _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m256i s3_25_4 =
-              _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_25_5 =
-              _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_26_4 =
-              _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_26_5 =
-              _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_29_4 =
-              _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_29_5 =
-              _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_30_4 =
-              _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_30_5 =
-              _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
-          // Combine
-          step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
-        }
-        // Stage 7
-        {
-          const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
-          const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
-          const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
-          const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
-          const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
-          const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
-          const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
-          const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
-          const __m256i out_02_2 =
-              _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m256i out_02_3 =
-              _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m256i out_18_2 =
-              _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m256i out_18_3 =
-              _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m256i out_10_2 =
-              _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m256i out_10_3 =
-              _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m256i out_26_2 =
-              _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m256i out_26_3 =
-              _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m256i out_06_2 =
-              _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m256i out_06_3 =
-              _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m256i out_22_2 =
-              _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m256i out_22_3 =
-              _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m256i out_14_2 =
-              _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m256i out_14_3 =
-              _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m256i out_30_2 =
-              _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m256i out_30_3 =
-              _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m256i out_02_4 =
-              _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_02_5 =
-              _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_18_4 =
-              _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_18_5 =
-              _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_10_4 =
-              _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_10_5 =
-              _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_26_4 =
-              _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_26_5 =
-              _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_06_4 =
-              _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_06_5 =
-              _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_22_4 =
-              _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_22_5 =
-              _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_14_4 =
-              _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_14_5 =
-              _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_30_4 =
-              _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_30_5 =
-              _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
-        }
-        {
-          step1[16] = _mm256_add_epi16(step3[17], step2[16]);
-          step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
-          step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
-          step1[19] = _mm256_add_epi16(step3[18], step2[19]);
-          step1[20] = _mm256_add_epi16(step3[21], step2[20]);
-          step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
-          step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
-          step1[23] = _mm256_add_epi16(step3[22], step2[23]);
-          step1[24] = _mm256_add_epi16(step3[25], step2[24]);
-          step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
-          step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
-          step1[27] = _mm256_add_epi16(step3[26], step2[27]);
-          step1[28] = _mm256_add_epi16(step3[29], step2[28]);
-          step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
-          step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
-          step1[31] = _mm256_add_epi16(step3[30], step2[31]);
-        }
-        // Final stage --- outputs indices are bit-reversed.
-        {
-          const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
-          const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
-          const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
-          const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
-          const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
-          const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
-          const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
-          const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
-          const __m256i out_01_2 =
-              _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m256i out_01_3 =
-              _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m256i out_17_2 =
-              _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m256i out_17_3 =
-              _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m256i out_09_2 =
-              _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m256i out_09_3 =
-              _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m256i out_25_2 =
-              _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m256i out_25_3 =
-              _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m256i out_07_2 =
-              _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m256i out_07_3 =
-              _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m256i out_23_2 =
-              _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m256i out_23_3 =
-              _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m256i out_15_2 =
-              _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m256i out_15_3 =
-              _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m256i out_31_2 =
-              _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m256i out_31_3 =
-              _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m256i out_01_4 =
-              _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_01_5 =
-              _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_17_4 =
-              _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_17_5 =
-              _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_09_4 =
-              _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_09_5 =
-              _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_25_4 =
-              _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_25_5 =
-              _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_07_4 =
-              _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_07_5 =
-              _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_23_4 =
-              _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_23_5 =
-              _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_15_4 =
-              _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_15_5 =
-              _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_31_4 =
-              _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_31_5 =
-              _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
-        }
-        {
-          const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
-          const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
-          const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
-          const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
-          const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
-          const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
-          const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
-          const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
-          const __m256i out_05_2 =
-              _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m256i out_05_3 =
-              _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m256i out_21_2 =
-              _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m256i out_21_3 =
-              _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m256i out_13_2 =
-              _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m256i out_13_3 =
-              _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m256i out_29_2 =
-              _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m256i out_29_3 =
-              _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m256i out_03_2 =
-              _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m256i out_03_3 =
-              _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m256i out_19_2 =
-              _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m256i out_19_3 =
-              _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m256i out_11_2 =
-              _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m256i out_11_3 =
-              _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m256i out_27_2 =
-              _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m256i out_27_3 =
-              _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m256i out_05_4 =
-              _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_05_5 =
-              _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_21_4 =
-              _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_21_5 =
-              _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_13_4 =
-              _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_13_5 =
-              _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_29_4 =
-              _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_29_5 =
-              _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_03_4 =
-              _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_03_5 =
-              _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_19_4 =
-              _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_19_5 =
-              _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_11_4 =
-              _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_11_5 =
-              _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_27_4 =
-              _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_27_5 =
-              _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m256i lstep1[64], lstep2[64], lstep3[64];
-        __m256i u[32], v[32], sign[16];
-        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
-        // start using 32-bit operations
-        // stage 3
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
-          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
-          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
-          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
-          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s3_10_4 =
-              _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_5 =
-              _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_4 =
-              _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_5 =
-              _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_4 =
-              _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_5 =
-              _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_4 =
-              _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_5 =
-              _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m256i k32_p16_p16 =
-              pair256_set_epi32(cospi_16_64, cospi_16_64);
-          const __m256i k32_p16_m16 =
-              pair256_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m256i k32_m24_m08 =
-              pair256_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m256i k32_p16_p16 =
-              pair256_set_epi32(cospi_16_64, cospi_16_64);
-          const __m256i k32_p16_m16 =
-              pair256_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-
-          u[0] = _mm256_sub_epi32(u[0], sign[0]);
-          u[1] = _mm256_sub_epi32(u[1], sign[1]);
-          u[2] = _mm256_sub_epi32(u[2], sign[2]);
-          u[3] = _mm256_sub_epi32(u[3], sign[3]);
-          u[4] = _mm256_sub_epi32(u[4], sign[4]);
-          u[5] = _mm256_sub_epi32(u[5], sign[5]);
-          u[6] = _mm256_sub_epi32(u[6], sign[6]);
-          u[7] = _mm256_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm256_add_epi32(u[0], K32One);
-          u[1] = _mm256_add_epi32(u[1], K32One);
-          u[2] = _mm256_add_epi32(u[2], K32One);
-          u[3] = _mm256_add_epi32(u[3], K32One);
-          u[4] = _mm256_add_epi32(u[4], K32One);
-          u[5] = _mm256_add_epi32(u[5], K32One);
-          u[6] = _mm256_add_epi32(u[6], K32One);
-          u[7] = _mm256_add_epi32(u[7], K32One);
-
-          u[0] = _mm256_srai_epi32(u[0], 2);
-          u[1] = _mm256_srai_epi32(u[1], 2);
-          u[2] = _mm256_srai_epi32(u[2], 2);
-          u[3] = _mm256_srai_epi32(u[3], 2);
-          u[4] = _mm256_srai_epi32(u[4], 2);
-          u[5] = _mm256_srai_epi32(u[5], 2);
-          u[6] = _mm256_srai_epi32(u[6], 2);
-          u[7] = _mm256_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm256_packs_epi32(u[0], u[1]);
-          out[16] = _mm256_packs_epi32(u[2], u[3]);
-          out[8] = _mm256_packs_epi32(u[4], u[5]);
-          out[24] = _mm256_packs_epi32(u[6], u[7]);
-        }
-        {
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m256i k32_m24_m08 =
-              pair256_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m256i k32_p28_p04 =
-              pair256_set_epi32(cospi_28_64, cospi_4_64);
-          const __m256i k32_p12_p20 =
-              pair256_set_epi32(cospi_12_64, cospi_20_64);
-          const __m256i k32_m20_p12 =
-              pair256_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m256i k32_m04_p28 =
-              pair256_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-
-          u[0] = _mm256_sub_epi32(u[0], sign[0]);
-          u[1] = _mm256_sub_epi32(u[1], sign[1]);
-          u[2] = _mm256_sub_epi32(u[2], sign[2]);
-          u[3] = _mm256_sub_epi32(u[3], sign[3]);
-          u[4] = _mm256_sub_epi32(u[4], sign[4]);
-          u[5] = _mm256_sub_epi32(u[5], sign[5]);
-          u[6] = _mm256_sub_epi32(u[6], sign[6]);
-          u[7] = _mm256_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm256_add_epi32(u[0], K32One);
-          u[1] = _mm256_add_epi32(u[1], K32One);
-          u[2] = _mm256_add_epi32(u[2], K32One);
-          u[3] = _mm256_add_epi32(u[3], K32One);
-          u[4] = _mm256_add_epi32(u[4], K32One);
-          u[5] = _mm256_add_epi32(u[5], K32One);
-          u[6] = _mm256_add_epi32(u[6], K32One);
-          u[7] = _mm256_add_epi32(u[7], K32One);
-
-          u[0] = _mm256_srai_epi32(u[0], 2);
-          u[1] = _mm256_srai_epi32(u[1], 2);
-          u[2] = _mm256_srai_epi32(u[2], 2);
-          u[3] = _mm256_srai_epi32(u[3], 2);
-          u[4] = _mm256_srai_epi32(u[4], 2);
-          u[5] = _mm256_srai_epi32(u[5], 2);
-          u[6] = _mm256_srai_epi32(u[6], 2);
-          u[7] = _mm256_srai_epi32(u[7], 2);
-
-          out[4] = _mm256_packs_epi32(u[0], u[1]);
-          out[20] = _mm256_packs_epi32(u[2], u[3]);
-          out[12] = _mm256_packs_epi32(u[4], u[5]);
-          out[28] = _mm256_packs_epi32(u[6], u[7]);
-        }
-        {
-          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m256i k32_m04_p28 =
-              pair256_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m256i k32_m28_m04 =
-              pair256_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m256i k32_m20_p12 =
-              pair256_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m256i k32_m12_m20 =
-              pair256_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m256i k32_p12_p20 =
-              pair256_set_epi32(cospi_12_64, cospi_20_64);
-          const __m256i k32_p28_p04 =
-              pair256_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 7
-        {
-          const __m256i k32_p30_p02 =
-              pair256_set_epi32(cospi_30_64, cospi_2_64);
-          const __m256i k32_p14_p18 =
-              pair256_set_epi32(cospi_14_64, cospi_18_64);
-          const __m256i k32_p22_p10 =
-              pair256_set_epi32(cospi_22_64, cospi_10_64);
-          const __m256i k32_p06_p26 =
-              pair256_set_epi32(cospi_6_64, cospi_26_64);
-          const __m256i k32_m26_p06 =
-              pair256_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m256i k32_m10_p22 =
-              pair256_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m256i k32_m18_p14 =
-              pair256_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m256i k32_m02_p30 =
-              pair256_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[2] = _mm256_packs_epi32(u[0], u[1]);
-          out[18] = _mm256_packs_epi32(u[2], u[3]);
-          out[10] = _mm256_packs_epi32(u[4], u[5]);
-          out[26] = _mm256_packs_epi32(u[6], u[7]);
-          out[6] = _mm256_packs_epi32(u[8], u[9]);
-          out[22] = _mm256_packs_epi32(u[10], u[11]);
-          out[14] = _mm256_packs_epi32(u[12], u[13]);
-          out[30] = _mm256_packs_epi32(u[14], u[15]);
-        }
-        {
-          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m256i k32_p31_p01 =
-              pair256_set_epi32(cospi_31_64, cospi_1_64);
-          const __m256i k32_p15_p17 =
-              pair256_set_epi32(cospi_15_64, cospi_17_64);
-          const __m256i k32_p23_p09 =
-              pair256_set_epi32(cospi_23_64, cospi_9_64);
-          const __m256i k32_p07_p25 =
-              pair256_set_epi32(cospi_7_64, cospi_25_64);
-          const __m256i k32_m25_p07 =
-              pair256_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m256i k32_m09_p23 =
-              pair256_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m256i k32_m17_p15 =
-              pair256_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m256i k32_m01_p31 =
-              pair256_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[1] = _mm256_packs_epi32(u[0], u[1]);
-          out[17] = _mm256_packs_epi32(u[2], u[3]);
-          out[9] = _mm256_packs_epi32(u[4], u[5]);
-          out[25] = _mm256_packs_epi32(u[6], u[7]);
-          out[7] = _mm256_packs_epi32(u[8], u[9]);
-          out[23] = _mm256_packs_epi32(u[10], u[11]);
-          out[15] = _mm256_packs_epi32(u[12], u[13]);
-          out[31] = _mm256_packs_epi32(u[14], u[15]);
-        }
-        {
-          const __m256i k32_p27_p05 =
-              pair256_set_epi32(cospi_27_64, cospi_5_64);
-          const __m256i k32_p11_p21 =
-              pair256_set_epi32(cospi_11_64, cospi_21_64);
-          const __m256i k32_p19_p13 =
-              pair256_set_epi32(cospi_19_64, cospi_13_64);
-          const __m256i k32_p03_p29 =
-              pair256_set_epi32(cospi_3_64, cospi_29_64);
-          const __m256i k32_m29_p03 =
-              pair256_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m256i k32_m13_p19 =
-              pair256_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m256i k32_m21_p11 =
-              pair256_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m256i k32_m05_p27 =
-              pair256_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[5] = _mm256_packs_epi32(u[0], u[1]);
-          out[21] = _mm256_packs_epi32(u[2], u[3]);
-          out[13] = _mm256_packs_epi32(u[4], u[5]);
-          out[29] = _mm256_packs_epi32(u[6], u[7]);
-          out[3] = _mm256_packs_epi32(u[8], u[9]);
-          out[19] = _mm256_packs_epi32(u[10], u[11]);
-          out[11] = _mm256_packs_epi32(u[12], u[13]);
-          out[27] = _mm256_packs_epi32(u[14], u[15]);
-        }
-      }
-#endif
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output_currStep, *output_nextStep;
-        tran_low_t *curr_out, *next_out;
-        // Pass 0
-        output_currStep = &intermediate[column_start * 32];
-        output_nextStep = &intermediate[(column_start + 8) * 32];
-        // Pass 1
-        curr_out = &output_org[column_start * 32];
-        next_out = &output_org[(column_start + 8) * 32];
-
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m256i *this_out = &out[8 * transpose_block];
-          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
-          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
-          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
-          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
-          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
-          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
-          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
-          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
-          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
-          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
-          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
-          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
-          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  101
-          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
-          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
-          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
-
-          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
-          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
-          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
-          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
-          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
-          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
-          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
-          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
-          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
-          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
-          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
-          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
-          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
-          // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151
-          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
-          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
-          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
-          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
-          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
-          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
-          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
-          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
-          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
-          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
-          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
-          // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151
-          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
-          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
-          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
-          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
-            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
-            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
-            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
-            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
-            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
-            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
-            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/av1_dct.c
-            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
-          }
-          if (0 == pass) {
-            // Note: even though all these stores are aligned, using the aligned
-            //       intrinsic make the code slightly slower.
-            _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
-                             _mm256_castsi256_si128(tr2_0));
-            _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
-                             _mm256_castsi256_si128(tr2_1));
-            _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
-                             _mm256_castsi256_si128(tr2_2));
-            _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
-                             _mm256_castsi256_si128(tr2_3));
-            _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
-                             _mm256_castsi256_si128(tr2_4));
-            _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
-                             _mm256_castsi256_si128(tr2_5));
-            _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
-                             _mm256_castsi256_si128(tr2_6));
-            _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
-                             _mm256_castsi256_si128(tr2_7));
-
-            _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
-                             _mm256_extractf128_si256(tr2_0, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
-                             _mm256_extractf128_si256(tr2_1, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
-                             _mm256_extractf128_si256(tr2_2, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
-                             _mm256_extractf128_si256(tr2_3, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
-                             _mm256_extractf128_si256(tr2_4, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
-                             _mm256_extractf128_si256(tr2_5, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
-                             _mm256_extractf128_si256(tr2_6, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
-                             _mm256_extractf128_si256(tr2_7, 1));
-            // Process next 8x8
-            output_currStep += 8;
-            output_nextStep += 8;
-          }
-          if (1 == pass) {
-            store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
-            store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
-            store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
-            store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
-            store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
-            store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
-            store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
-            store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
-            curr_out += 8;
-            next_out += 8;
-          }
-        }
-      }
-    }
-  }
-  _mm256_zeroupper();
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 69dd6af11..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3201 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also causes cross reference to the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c
-#else
-void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    aom_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c
-#endif  // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif  // DCT_HIGH_BIT_DEPTH
-
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 8) {
-      __m128i step1[32];
-      __m128i step2[32];
-      __m128i step3[32];
-      __m128i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic make the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m128i *step1a = &step1[0];
-          __m128i *step1b = &step1[31];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m128i *step1a = &step1[4];
-          __m128i *step1b = &step1[27];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m128i *step1a = &step1[8];
-          __m128i *step1b = &step1[23];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m128i *step1a = &step1[12];
-          __m128i *step1b = &step1[19];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
-          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
-          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
-          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
-          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
-          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
-          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
-          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
-          step1[0] = ADD_EPI16(in00, in31);
-          step1[1] = ADD_EPI16(in01, in30);
-          step1[2] = ADD_EPI16(in02, in29);
-          step1[3] = ADD_EPI16(in03, in28);
-          step1[28] = SUB_EPI16(in03, in28);
-          step1[29] = SUB_EPI16(in02, in29);
-          step1[30] = SUB_EPI16(in01, in30);
-          step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
-                                             &step1[3], &step1[28], &step1[29],
-                                             &step1[30], &step1[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
-          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
-          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
-          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
-          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
-          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
-          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
-          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
-          step1[4] = ADD_EPI16(in04, in27);
-          step1[5] = ADD_EPI16(in05, in26);
-          step1[6] = ADD_EPI16(in06, in25);
-          step1[7] = ADD_EPI16(in07, in24);
-          step1[24] = SUB_EPI16(in07, in24);
-          step1[25] = SUB_EPI16(in06, in25);
-          step1[26] = SUB_EPI16(in05, in26);
-          step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
-                                             &step1[7], &step1[24], &step1[25],
-                                             &step1[26], &step1[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
-          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
-          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
-          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
-          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
-          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
-          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
-          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
-          step1[8] = ADD_EPI16(in08, in23);
-          step1[9] = ADD_EPI16(in09, in22);
-          step1[10] = ADD_EPI16(in10, in21);
-          step1[11] = ADD_EPI16(in11, in20);
-          step1[20] = SUB_EPI16(in11, in20);
-          step1[21] = SUB_EPI16(in10, in21);
-          step1[22] = SUB_EPI16(in09, in22);
-          step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
-                                             &step1[11], &step1[20], &step1[21],
-                                             &step1[22], &step1[23]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
-          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
-          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
-          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
-          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
-          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
-          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
-          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
-          step1[12] = ADD_EPI16(in12, in19);
-          step1[13] = ADD_EPI16(in13, in18);
-          step1[14] = ADD_EPI16(in14, in17);
-          step1[15] = ADD_EPI16(in15, in16);
-          step1[16] = SUB_EPI16(in15, in16);
-          step1[17] = SUB_EPI16(in14, in17);
-          step1[18] = SUB_EPI16(in13, in18);
-          step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
-                                             &step1[15], &step1[16], &step1[17],
-                                             &step1[18], &step1[19]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = ADD_EPI16(step1[0], step1[15]);
-        step2[1] = ADD_EPI16(step1[1], step1[14]);
-        step2[2] = ADD_EPI16(step1[2], step1[13]);
-        step2[3] = ADD_EPI16(step1[3], step1[12]);
-        step2[4] = ADD_EPI16(step1[4], step1[11]);
-        step2[5] = ADD_EPI16(step1[5], step1[10]);
-        step2[6] = ADD_EPI16(step1[6], step1[9]);
-        step2[7] = ADD_EPI16(step1[7], step1[8]);
-        step2[8] = SUB_EPI16(step1[7], step1[8]);
-        step2[9] = SUB_EPI16(step1[6], step1[9]);
-        step2[10] = SUB_EPI16(step1[5], step1[10]);
-        step2[11] = SUB_EPI16(step1[4], step1[11]);
-        step2[12] = SUB_EPI16(step1[3], step1[12]);
-        step2[13] = SUB_EPI16(step1[2], step1[13]);
-        step2[14] = SUB_EPI16(step1[1], step1[14]);
-        step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x16(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      {
-        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
-        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
-                                           &step2[23], &step2[24], &step2[25],
-                                           &step2[26], &step2[27]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
-        __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
-        __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
-        __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
-        __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
-        __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
-        __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
-        __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
-        __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
-        __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
-        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
-        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
-        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
-        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
-        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
-        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
-        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
-        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
-        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
-        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
-        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
-        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
-        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
-        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
-        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
-        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
-        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
-        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
-        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
-        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
-        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
-        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
-        step2[0] = SUB_EPI16(step2[0], s3_00_0);
-        step2[1] = SUB_EPI16(step2[1], s3_01_0);
-        step2[2] = SUB_EPI16(step2[2], s3_02_0);
-        step2[3] = SUB_EPI16(step2[3], s3_03_0);
-        step2[4] = SUB_EPI16(step2[4], s3_04_0);
-        step2[5] = SUB_EPI16(step2[5], s3_05_0);
-        step2[6] = SUB_EPI16(step2[6], s3_06_0);
-        step2[7] = SUB_EPI16(step2[7], s3_07_0);
-        step2[8] = SUB_EPI16(step2[8], s2_08_0);
-        step2[9] = SUB_EPI16(step2[9], s2_09_0);
-        step2[10] = SUB_EPI16(step2[10], s3_10_0);
-        step2[11] = SUB_EPI16(step2[11], s3_11_0);
-        step2[12] = SUB_EPI16(step2[12], s3_12_0);
-        step2[13] = SUB_EPI16(step2[13], s3_13_0);
-        step2[14] = SUB_EPI16(step2[14], s2_14_0);
-        step2[15] = SUB_EPI16(step2[15], s2_15_0);
-        step1[16] = SUB_EPI16(step1[16], s3_16_0);
-        step1[17] = SUB_EPI16(step1[17], s3_17_0);
-        step1[18] = SUB_EPI16(step1[18], s3_18_0);
-        step1[19] = SUB_EPI16(step1[19], s3_19_0);
-        step2[20] = SUB_EPI16(step2[20], s3_20_0);
-        step2[21] = SUB_EPI16(step2[21], s3_21_0);
-        step2[22] = SUB_EPI16(step2[22], s3_22_0);
-        step2[23] = SUB_EPI16(step2[23], s3_23_0);
-        step2[24] = SUB_EPI16(step2[24], s3_24_0);
-        step2[25] = SUB_EPI16(step2[25], s3_25_0);
-        step2[26] = SUB_EPI16(step2[26], s3_26_0);
-        step2[27] = SUB_EPI16(step2[27], s3_27_0);
-        step1[28] = SUB_EPI16(step1[28], s3_28_0);
-        step1[29] = SUB_EPI16(step1[29], s3_29_0);
-        step1[30] = SUB_EPI16(step1[30], s3_30_0);
-        step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x32(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
-            &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
-            &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
-            &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
-        if (overflow) {
-          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        step2[0] = _mm_add_epi16(step2[0], kOne);
-        step2[1] = _mm_add_epi16(step2[1], kOne);
-        step2[2] = _mm_add_epi16(step2[2], kOne);
-        step2[3] = _mm_add_epi16(step2[3], kOne);
-        step2[4] = _mm_add_epi16(step2[4], kOne);
-        step2[5] = _mm_add_epi16(step2[5], kOne);
-        step2[6] = _mm_add_epi16(step2[6], kOne);
-        step2[7] = _mm_add_epi16(step2[7], kOne);
-        step2[8] = _mm_add_epi16(step2[8], kOne);
-        step2[9] = _mm_add_epi16(step2[9], kOne);
-        step2[10] = _mm_add_epi16(step2[10], kOne);
-        step2[11] = _mm_add_epi16(step2[11], kOne);
-        step2[12] = _mm_add_epi16(step2[12], kOne);
-        step2[13] = _mm_add_epi16(step2[13], kOne);
-        step2[14] = _mm_add_epi16(step2[14], kOne);
-        step2[15] = _mm_add_epi16(step2[15], kOne);
-        step1[16] = _mm_add_epi16(step1[16], kOne);
-        step1[17] = _mm_add_epi16(step1[17], kOne);
-        step1[18] = _mm_add_epi16(step1[18], kOne);
-        step1[19] = _mm_add_epi16(step1[19], kOne);
-        step2[20] = _mm_add_epi16(step2[20], kOne);
-        step2[21] = _mm_add_epi16(step2[21], kOne);
-        step2[22] = _mm_add_epi16(step2[22], kOne);
-        step2[23] = _mm_add_epi16(step2[23], kOne);
-        step2[24] = _mm_add_epi16(step2[24], kOne);
-        step2[25] = _mm_add_epi16(step2[25], kOne);
-        step2[26] = _mm_add_epi16(step2[26], kOne);
-        step2[27] = _mm_add_epi16(step2[27], kOne);
-        step1[28] = _mm_add_epi16(step1[28], kOne);
-        step1[29] = _mm_add_epi16(step1[29], kOne);
-        step1[30] = _mm_add_epi16(step1[30], kOne);
-        step1[31] = _mm_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm_srai_epi16(step2[0], 2);
-        step2[1] = _mm_srai_epi16(step2[1], 2);
-        step2[2] = _mm_srai_epi16(step2[2], 2);
-        step2[3] = _mm_srai_epi16(step2[3], 2);
-        step2[4] = _mm_srai_epi16(step2[4], 2);
-        step2[5] = _mm_srai_epi16(step2[5], 2);
-        step2[6] = _mm_srai_epi16(step2[6], 2);
-        step2[7] = _mm_srai_epi16(step2[7], 2);
-        step2[8] = _mm_srai_epi16(step2[8], 2);
-        step2[9] = _mm_srai_epi16(step2[9], 2);
-        step2[10] = _mm_srai_epi16(step2[10], 2);
-        step2[11] = _mm_srai_epi16(step2[11], 2);
-        step2[12] = _mm_srai_epi16(step2[12], 2);
-        step2[13] = _mm_srai_epi16(step2[13], 2);
-        step2[14] = _mm_srai_epi16(step2[14], 2);
-        step2[15] = _mm_srai_epi16(step2[15], 2);
-        step1[16] = _mm_srai_epi16(step1[16], 2);
-        step1[17] = _mm_srai_epi16(step1[17], 2);
-        step1[18] = _mm_srai_epi16(step1[18], 2);
-        step1[19] = _mm_srai_epi16(step1[19], 2);
-        step2[20] = _mm_srai_epi16(step2[20], 2);
-        step2[21] = _mm_srai_epi16(step2[21], 2);
-        step2[22] = _mm_srai_epi16(step2[22], 2);
-        step2[23] = _mm_srai_epi16(step2[23], 2);
-        step2[24] = _mm_srai_epi16(step2[24], 2);
-        step2[25] = _mm_srai_epi16(step2[25], 2);
-        step2[26] = _mm_srai_epi16(step2[26], 2);
-        step2[27] = _mm_srai_epi16(step2[27], 2);
-        step1[28] = _mm_srai_epi16(step1[28], 2);
-        step1[29] = _mm_srai_epi16(step1[29], 2);
-        step1[30] = _mm_srai_epi16(step1[30], 2);
-        step1[31] = _mm_srai_epi16(step1[31], 2);
-      }
-#endif  // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
-          step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
-          step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
-          step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
-          step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
-          step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
-          step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
-          step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
-          step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
-                                             &step3[3], &step3[4], &step3[5],
-                                             &step3[6], &step3[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
-                                             &step3[13]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[16] = ADD_EPI16(step2[23], step1[16]);
-          step3[17] = ADD_EPI16(step2[22], step1[17]);
-          step3[18] = ADD_EPI16(step2[21], step1[18]);
-          step3[19] = ADD_EPI16(step2[20], step1[19]);
-          step3[20] = SUB_EPI16(step1[19], step2[20]);
-          step3[21] = SUB_EPI16(step1[18], step2[21]);
-          step3[22] = SUB_EPI16(step1[17], step2[22]);
-          step3[23] = SUB_EPI16(step1[16], step2[23]);
-          step3[24] = SUB_EPI16(step1[31], step2[24]);
-          step3[25] = SUB_EPI16(step1[30], step2[25]);
-          step3[26] = SUB_EPI16(step1[29], step2[26]);
-          step3[27] = SUB_EPI16(step1[28], step2[27]);
-          step3[28] = ADD_EPI16(step2[27], step1[28]);
-          step3[29] = ADD_EPI16(step2[26], step1[29]);
-          step3[30] = ADD_EPI16(step2[25], step1[30]);
-          step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
-              &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
-              &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
-              &step3[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-
-        // Stage 4
-        {
-          step1[0] = ADD_EPI16(step3[3], step3[0]);
-          step1[1] = ADD_EPI16(step3[2], step3[1]);
-          step1[2] = SUB_EPI16(step3[1], step3[2]);
-          step1[3] = SUB_EPI16(step3[0], step3[3]);
-          step1[8] = ADD_EPI16(step3[11], step2[8]);
-          step1[9] = ADD_EPI16(step3[10], step2[9]);
-          step1[10] = SUB_EPI16(step2[9], step3[10]);
-          step1[11] = SUB_EPI16(step2[8], step3[11]);
-          step1[12] = SUB_EPI16(step2[15], step3[12]);
-          step1[13] = SUB_EPI16(step2[14], step3[13]);
-          step1[14] = ADD_EPI16(step3[13], step2[14]);
-          step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
-              &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
-              &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-          const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-          const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-          const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-          const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-          const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-          const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-          const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-          const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-          const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-          const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
-                                             &step1[21], &step1[26], &step1[27],
-                                             &step1[28], &step1[29]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 5
-        {
-          step2[4] = ADD_EPI16(step1[5], step3[4]);
-          step2[5] = SUB_EPI16(step3[4], step1[5]);
-          step2[6] = SUB_EPI16(step3[7], step1[6]);
-          step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
-                                             &step2[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-          const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-          const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-          const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-          const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i out_00_4 =
-              _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_5 =
-              _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_4 =
-              _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_5 =
-              _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_4 =
-              _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_5 =
-              _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_4 =
-              _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_5 =
-              _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-          const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-          const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-          const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-          const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
-                                             &step2[14]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step2[16] = ADD_EPI16(step1[19], step3[16]);
-          step2[17] = ADD_EPI16(step1[18], step3[17]);
-          step2[18] = SUB_EPI16(step3[17], step1[18]);
-          step2[19] = SUB_EPI16(step3[16], step1[19]);
-          step2[20] = SUB_EPI16(step3[23], step1[20]);
-          step2[21] = SUB_EPI16(step3[22], step1[21]);
-          step2[22] = ADD_EPI16(step1[21], step3[22]);
-          step2[23] = ADD_EPI16(step1[20], step3[23]);
-          step2[24] = ADD_EPI16(step1[27], step3[24]);
-          step2[25] = ADD_EPI16(step1[26], step3[25]);
-          step2[26] = SUB_EPI16(step3[25], step1[26]);
-          step2[27] = SUB_EPI16(step3[24], step1[27]);
-          step2[28] = SUB_EPI16(step3[31], step1[28]);
-          step2[29] = SUB_EPI16(step3[30], step1[29]);
-          step2[30] = ADD_EPI16(step1[29], step3[30]);
-          step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
-              &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
-              &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
-              &step2[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 6
-        {
-          const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m128i out_04_4 =
-              _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_5 =
-              _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_4 =
-              _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_5 =
-              _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_4 =
-              _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_5 =
-              _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_4 =
-              _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_5 =
-              _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[8] = ADD_EPI16(step2[9], step1[8]);
-          step3[9] = SUB_EPI16(step1[8], step2[9]);
-          step3[10] = SUB_EPI16(step1[11], step2[10]);
-          step3[11] = ADD_EPI16(step2[10], step1[11]);
-          step3[12] = ADD_EPI16(step2[13], step1[12]);
-          step3[13] = SUB_EPI16(step1[12], step2[13]);
-          step3[14] = SUB_EPI16(step1[15], step2[14]);
-          step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
-                                             &step3[11], &step3[12], &step3[13],
-                                             &step3[14], &step3[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-          const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-          const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-          const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-          const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-          const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-          const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-          const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-          const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-          // Combine
-          step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
-                                             &step3[22], &step3[25], &step3[26],
-                                             &step3[29], &step3[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 7
-        {
-          const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-          const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-          const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-          const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-          const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-          const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-          const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-          const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-          const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m128i out_02_4 =
-              _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_5 =
-              _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_4 =
-              _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_5 =
-              _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_4 =
-              _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_5 =
-              _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_4 =
-              _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_5 =
-              _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_4 =
-              _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_5 =
-              _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_4 =
-              _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_5 =
-              _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_4 =
-              _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_5 =
-              _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_4 =
-              _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_5 =
-              _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step1[16] = ADD_EPI16(step3[17], step2[16]);
-          step1[17] = SUB_EPI16(step2[16], step3[17]);
-          step1[18] = SUB_EPI16(step2[19], step3[18]);
-          step1[19] = ADD_EPI16(step3[18], step2[19]);
-          step1[20] = ADD_EPI16(step3[21], step2[20]);
-          step1[21] = SUB_EPI16(step2[20], step3[21]);
-          step1[22] = SUB_EPI16(step2[23], step3[22]);
-          step1[23] = ADD_EPI16(step3[22], step2[23]);
-          step1[24] = ADD_EPI16(step3[25], step2[24]);
-          step1[25] = SUB_EPI16(step2[24], step3[25]);
-          step1[26] = SUB_EPI16(step2[27], step3[26]);
-          step1[27] = ADD_EPI16(step3[26], step2[27]);
-          step1[28] = ADD_EPI16(step3[29], step2[28]);
-          step1[29] = SUB_EPI16(step2[28], step3[29]);
-          step1[30] = SUB_EPI16(step2[31], step3[30]);
-          step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
-              &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
-              &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
-              &step1[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Final stage --- outputs indices are bit-reversed.
-        {
-          const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-          const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-          const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-          const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-          const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-          const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-          const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-          const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-          const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m128i out_01_4 =
-              _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_5 =
-              _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_4 =
-              _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_5 =
-              _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_4 =
-              _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_5 =
-              _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_4 =
-              _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_5 =
-              _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_4 =
-              _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_5 =
-              _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_4 =
-              _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_5 =
-              _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_4 =
-              _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_5 =
-              _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_4 =
-              _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_5 =
-              _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-          const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-          const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-          const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-          const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-          const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-          const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-          const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-          const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m128i out_05_4 =
-              _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_5 =
-              _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_4 =
-              _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_5 =
-              _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_4 =
-              _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_5 =
-              _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_4 =
-              _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_5 =
-              _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_4 =
-              _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_5 =
-              _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_4 =
-              _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_5 =
-              _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_4 =
-              _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_5 =
-              _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_4 =
-              _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_5 =
-              _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m128i lstep1[64], lstep2[64], lstep3[64];
-        __m128i u[32], v[32], sign[16];
-        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
-        // start using 32-bit operations
-        // stage 3
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
-                                              &v[5], &v[6], &v[7], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm_packs_epi32(u[0], u[1]);
-          out[16] = _mm_packs_epi32(u[2], u[3]);
-          out[8] = _mm_packs_epi32(u[4], u[5]);
-          out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          out[4] = _mm_packs_epi32(u[0], u[1]);
-          out[20] = _mm_packs_epi32(u[2], u[3]);
-          out[12] = _mm_packs_epi32(u[4], u[5]);
-          out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m12_m20 =
-              pair_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 7
-        {
-          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
-          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
-          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
-          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
-          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[2] = _mm_packs_epi32(u[0], u[1]);
-          out[18] = _mm_packs_epi32(u[2], u[3]);
-          out[10] = _mm_packs_epi32(u[4], u[5]);
-          out[26] = _mm_packs_epi32(u[6], u[7]);
-          out[6] = _mm_packs_epi32(u[8], u[9]);
-          out[22] = _mm_packs_epi32(u[10], u[11]);
-          out[14] = _mm_packs_epi32(u[12], u[13]);
-          out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
-          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
-          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
-          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
-          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[1] = _mm_packs_epi32(u[0], u[1]);
-          out[17] = _mm_packs_epi32(u[2], u[3]);
-          out[9] = _mm_packs_epi32(u[4], u[5]);
-          out[25] = _mm_packs_epi32(u[6], u[7]);
-          out[7] = _mm_packs_epi32(u[8], u[9]);
-          out[23] = _mm_packs_epi32(u[10], u[11]);
-          out[15] = _mm_packs_epi32(u[12], u[13]);
-          out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
-          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
-          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
-          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
-          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[5] = _mm_packs_epi32(u[0], u[1]);
-          out[21] = _mm_packs_epi32(u[2], u[3]);
-          out[13] = _mm_packs_epi32(u[4], u[5]);
-          out[29] = _mm_packs_epi32(u[6], u[7]);
-          out[3] = _mm_packs_epi32(u[8], u[9]);
-          out[19] = _mm_packs_epi32(u[10], u[11]);
-          out[11] = _mm_packs_epi32(u[12], u[13]);
-          out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-#endif  // FDCT32x32_HIGH_PRECISION
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output0 = &intermediate[column_start * 32];
-        tran_low_t *output1 = &output_org[column_start * 32];
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m128i *this_out = &out[8 * transpose_block];
-          // 00 01 02 03 04 05 06 07
-          // 10 11 12 13 14 15 16 17
-          // 20 21 22 23 24 25 26 27
-          // 30 31 32 33 34 35 36 37
-          // 40 41 42 43 44 45 46 47
-          // 50 51 52 53 54 55 56 57
-          // 60 61 62 63 64 65 66 67
-          // 70 71 72 73 74 75 76 77
-          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00 10 01 11 02 12 03 13
-          // 20 30 21 31 22 32 23 33
-          // 04 14 05 15 06 16 07 17
-          // 24 34 25 35 26 36 27 37
-          // 40 50 41 51 42 52 43 53
-          // 60 70 61 71 62 72 63 73
-          // 54 54 55 55 56 56 57 57
-          // 64 74 65 75 66 76 67 77
-          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 10 20 30 01 11 21 31
-          // 40 50 60 70 41 51 61 71
-          // 02 12 22 32 03 13 23 33
-          // 42 52 62 72 43 53 63 73
-          // 04 14 24 34 05 15 21 36
-          // 44 54 64 74 45 55 61 76
-          // 06 16 26 36 07 17 27 37
-          // 46 56 66 76 47 57 67 77
-          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 10 20 30 40 50 60 70
-          // 01 11 21 31 41 51 61 71
-          // 02 12 22 32 42 52 62 72
-          // 03 13 23 33 43 53 63 73
-          // 04 14 24 34 44 54 64 74
-          // 05 15 25 35 45 55 65 75
-          // 06 16 26 36 46 56 66 76
-          // 07 17 27 37 47 57 67 77
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
-            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
-            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
-            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
-            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
-            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
-            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
-            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/av1_dct.c
-            tr2_0 = _mm_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm_srai_epi16(tr2_7, 2);
-          }
-          // Note: even though all these stores are aligned, using the aligned
-          //       intrinsic make the code slightly slower.
-          if (pass == 0) {
-            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
-            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
-            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
-            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
-            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
-            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
-            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
-            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
-            // Process next 8x8
-            output0 += 8;
-          } else {
-            storeu_output(&tr2_0, (output1 + 0 * 32));
-            storeu_output(&tr2_1, (output1 + 1 * 32));
-            storeu_output(&tr2_2, (output1 + 2 * 32));
-            storeu_output(&tr2_3, (output1 + 3 * 32));
-            storeu_output(&tr2_4, (output1 + 4 * 32));
-            storeu_output(&tr2_5, (output1 + 5 * 32));
-            storeu_output(&tr2_6, (output1 + 6 * 32));
-            storeu_output(&tr2_7, (output1 + 7 * 32));
-            // Process next 8x8
-            output1 += 8;
-          }
-        }
-      }
-    }
-  }
-}  // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
deleted file mode 100644
index 670f864d0..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
deleted file mode 100644
index 86df4a6f6..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
-#define AOM_DSP_X86_FWD_TXFM_AVX2_H
-
-#include "./aom_config.h"
-
-static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
-
-    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
-
-    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
-
-    _mm256_storeu_si256((__m256i *)out, y0);
-    _mm256_storeu_si256((__m256i *)(out + 8), y1);
-  } else {
-    _mm256_storeu_si256((__m256i *)out, *coeff);
-  }
-}
-
-#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 7bb1db70a..1e3d13ec8 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -11,7 +11,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
@@ -29,233 +30,6 @@
 #define SUB_EPI16 _mm_sub_epi16
 #endif
 
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // This 2D transform implements 4 vertical 1D transforms followed
-  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
-  // by Chen, Smith and Fralick ('77).  The commands for moving the data
-  // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to at i0
-  // through iF (in raster order), intermediate variables are a0, b0, c0
-  // through f, and correspond to the in-place computations mapped to input
-  // locations.  The outputs, o0 through oF are labeled according to the
-  // output locations.
-
-  // Constants
-  // These are the coefficients used for the multiplies.
-  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
-  // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // This second rounding constant saves doing some extra adds at the end
-  const __m128i k__DCT_CONST_ROUNDING2 =
-      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
-  __m128i cmp0, cmp1;
-  int test, overflow;
-#endif
-
-  // Load inputs.
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  // in0 = [i0 i1 i2 i3 iC iD iE iF]
-  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-#if DCT_HIGH_BIT_DEPTH
-  // Check inputs small enough to use optimised code
-  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
-  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
-  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
-  if (test) {
-    aom_highbd_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-  // multiply by 16 to give some extra precision
-  in0 = _mm_slli_epi16(in0, 4);
-  in1 = _mm_slli_epi16(in1, 4);
-  // if (i == 0 && input[0]) input[0] += 1;
-  // add 1 to the upper left pixel if it is non-zero, which helps reduce
-  // the round-trip error
-  {
-    // The mask will only contain whether the first value is zero, all
-    // other comparison will fail as something shifted by 4 (above << 4)
-    // can never be equal to one. To increment in the non-zero case, we
-    // add the mask and one for the first element:
-    //   - if zero, mask = -1, v = v - 1 + 1 = v
-    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
-    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
-    in0 = _mm_add_epi16(in0, mask);
-    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
-  }
-  // There are 4 total stages, alternating between an add/subtract stage
-  // followed by an multiply-and-add stage.
-  {
-    // Stage 1: Add/subtract
-
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
-    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
-    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
-    // r1 = [iC i8 iD i9 iE iA iF iB]
-    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
-    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
-    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
-    // r3 = [iC i8 iD i9 iF iB iE iA]
-
-    const __m128i t0 = _mm_add_epi16(r2, r3);
-    const __m128i t1 = _mm_sub_epi16(r2, r3);
-    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
-    // t1 = [aC a8 aD a9 aF aB aE aA]
-
-    // Stage 2: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
-    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
-    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
-    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
-    // Then add and right-shift to get back to 16-bit range
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    // w0 = [b0 b1 b7 b6]
-    // w1 = [b8 b9 bF bE]
-    // w2 = [b4 b5 b3 b2]
-    // w3 = [bC bD bB bA]
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&x0, &x1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
-    // x1 = [b4 b5 b3 b2 bC bD bB bA]
-    in0 = _mm_shuffle_epi32(x0, 0xD8);
-    in1 = _mm_shuffle_epi32(x1, 0x8D);
-    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
-    // in1 = [b3 b2 bB bA b4 b5 bC bD]
-  }
-  {
-    // vertical DCTs finished. Now we do the horizontal DCTs.
-    // Stage 3: Add/subtract
-
-    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
-    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-    const __m128i t0 = ADD_EPI16(in0, in1);
-    const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&t0, &t1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-    // Stage 4: multiply by constants (which gets us into 32 bits).
-    {
-      // The constants needed here are:
-      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
-      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
-      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
-      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
-      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
-      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
-      // Then add and right-shift to get back to 16-bit range
-      // but this combines the final right-shift as well to save operations
-      // This unusual rounding operations is to maintain bit-accurate
-      // compatibility with the c version of this function which has two
-      // rounding steps in a row.
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
-      // w0 = [o0 o4 o8 oC]
-      // w1 = [o2 o6 oA oE]
-      // w2 = [o1 o5 o9 oD]
-      // w3 = [o3 o7 oB oF]
-      // remember the o's are numbered according to the correct output location
-      const __m128i x0 = _mm_packs_epi32(w0, w1);
-      const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&x0, &x1);
-      if (overflow) {
-        aom_highbd_fdct4x4_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
-        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
-        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
-        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
-        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
-        // y1 = [o2 o3 o6 o7 oA oB oE oF]
-        in0 = _mm_unpacklo_epi32(y0, y1);
-        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
-        in1 = _mm_unpackhi_epi32(y0, y1);
-        // in1 = [o8 o9 oA oB oC oD oE oF]
-      }
-    }
-  }
-  // Post-condition (v + 1) >> 2 is now incorporated into previous
-  // add and right-shift commands.  Only 2 store instructions needed
-  // because we are using the fact that 1/3 are stored just after 0/2.
-  storeu_output(&in0, output + 0 * 4);
-  storeu_output(&in1, output + 2 * 4);
-}
-
 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // Constants
@@ -566,449 +340,5 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   }
 }
 
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
-  const int16_t *in = input;
-  int16_t *out0 = intermediate;
-  tran_low_t *out1 = output;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-#if DCT_HIGH_BIT_DEPTH
-    int overflow;
-#endif
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = ADD_EPI16(in00, in15);
-        input1 = ADD_EPI16(in01, in14);
-        input2 = ADD_EPI16(in02, in13);
-        input3 = ADD_EPI16(in03, in12);
-        input4 = ADD_EPI16(in04, in11);
-        input5 = ADD_EPI16(in05, in10);
-        input6 = ADD_EPI16(in06, in09);
-        input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
-                                           &input4, &input5, &input6, &input7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = SUB_EPI16(in07, in08);
-        step1_1 = SUB_EPI16(in06, in09);
-        step1_2 = SUB_EPI16(in05, in10);
-        step1_3 = SUB_EPI16(in04, in11);
-        step1_4 = SUB_EPI16(in03, in12);
-        step1_5 = SUB_EPI16(in02, in13);
-        step1_6 = SUB_EPI16(in01, in14);
-        step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                    &step1_4, &step1_5, &step1_6, &step1_7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = ADD_EPI16(input0, input7);
-        const __m128i q1 = ADD_EPI16(input1, input6);
-        const __m128i q2 = ADD_EPI16(input2, input5);
-        const __m128i q3 = ADD_EPI16(input3, input4);
-        const __m128i q4 = SUB_EPI16(input3, input4);
-        const __m128i q5 = SUB_EPI16(input2, input5);
-        const __m128i q6 = SUB_EPI16(input1, input6);
-        const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = ADD_EPI16(q0, q3);
-          const __m128i r1 = ADD_EPI16(q1, q2);
-          const __m128i r2 = SUB_EPI16(q1, q2);
-          const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          {
-            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-          }
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i r0 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          const __m128i r1 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&r0, &r1);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          {
-            // Add/subtract
-            const __m128i x0 = ADD_EPI16(q4, r0);
-            const __m128i x1 = SUB_EPI16(q4, r0);
-            const __m128i x2 = SUB_EPI16(q7, r1);
-            const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-            // Interleave to do the multiply by constants which gets us
-            // into 32 bits.
-            {
-              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-              overflow =
-                  check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
-              if (overflow) {
-                aom_highbd_fdct16x16_c(input, output, stride);
-                return;
-              }
-#endif  // DCT_HIGH_BIT_DEPTH
-            }
-          }
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 3
-        {
-          step3_0 = ADD_EPI16(step1_0, step2_3);
-          step3_1 = ADD_EPI16(step1_1, step2_2);
-          step3_2 = SUB_EPI16(step1_1, step2_2);
-          step3_3 = SUB_EPI16(step1_0, step2_3);
-          step3_4 = SUB_EPI16(step1_7, step2_4);
-          step3_5 = SUB_EPI16(step1_6, step2_5);
-          step3_6 = ADD_EPI16(step1_6, step2_5);
-          step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
-                                      &step3_4, &step3_5, &step3_6, &step3_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 5
-        {
-          step1_0 = ADD_EPI16(step3_0, step2_1);
-          step1_1 = SUB_EPI16(step3_0, step2_1);
-          step1_2 = ADD_EPI16(step3_3, step2_2);
-          step1_3 = SUB_EPI16(step3_3, step2_2);
-          step1_4 = SUB_EPI16(step3_4, step2_5);
-          step1_5 = ADD_EPI16(step3_4, step2_5);
-          step1_6 = SUB_EPI16(step3_7, step2_6);
-          step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                      &step1_4, &step1_5, &step1_6, &step1_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
-                              &res06, &res07, pass, out0, out1);
-      transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
-                              &res14, &res15, pass, out0 + 8, out1 + 8);
-      if (pass == 0) {
-        out0 += 8 * 16;
-      } else {
-        out1 += 8 * 16;
-      }
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-  }
-}
-
 #undef ADD_EPI16
 #undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
index 657dcfa22..2d8f8f71e 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -11,40 +11,12 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 
-void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0, in1;
-  __m128i tmp;
-  const __m128i zero = _mm_setzero_si128();
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
-  tmp = _mm_add_epi16(in0, in1);
-  in0 = _mm_unpacklo_epi16(zero, tmp);
-  in1 = _mm_unpackhi_epi16(zero, tmp);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(tmp, zero);
-  in1 = _mm_unpackhi_epi32(tmp, zero);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(tmp, 8);
-
-  in1 = _mm_add_epi32(tmp, in0);
-  in0 = _mm_slli_epi32(in1, 1);
-  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
-}
-
 void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
@@ -86,47 +58,12 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
 }
 
 #define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D aom_fdct4x4_sse2
 #define FDCT8x8_2D aom_fdct8x8_sse2
-#define FDCT16x16_2D aom_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
 
-#define FDCT32x32_2D aom_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
 #undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_HIGHBITDEPTH
 #define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D aom_highbd_fdct4x4_sse2
 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 58e8971dd..12ccf7f26 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -12,15 +12,10 @@
 #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
 #define AOM_DSP_X86_FWD_TXFM_SSE2_H_
 
-#include "aom_dsp/x86/txfm_common_intrin.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define pair_set_epi32(a, b) \
-  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
-
 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
   __m128i buf0, buf1;
   buf0 = _mm_mul_epu32(a, b);
@@ -140,112 +135,6 @@ static INLINE int check_epi16_overflow_x32(
   return res0 + res1;
 }
 
-static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
-                                           const __m128i *preg1,
-                                           const __m128i *preg2,
-                                           const __m128i *preg3,
-                                           const __m128i *zero) {
-  __m128i minus_one = _mm_set1_epi32(-1);
-  // Check for overflows
-  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
-  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
-  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
-  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
-  __m128i reg0_top_dwords =
-      _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg1_top_dwords =
-      _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg2_top_dwords =
-      _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg3_top_dwords =
-      _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
-  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
-  __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
-  __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
-  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
-  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
-  int overflow_01 =
-      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
-  int overflow_23 =
-      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
-  return (overflow_01 + overflow_23);
-}
-
-static INLINE int k_check_epi32_overflow_8(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-  }
-  return overflow;
-}
-
-static INLINE int k_check_epi32_overflow_16(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-    if (!overflow) {
-      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
-      if (!overflow) {
-        overflow =
-            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
-      }
-    }
-  }
-  return overflow;
-}
-
-static INLINE int k_check_epi32_overflow_32(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
-    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
-    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
-    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
-    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
-    const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-    if (!overflow) {
-      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
-      if (!overflow) {
-        overflow =
-            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
-        if (!overflow) {
-          overflow =
-              k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
-          if (!overflow) {
-            overflow =
-                k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
-            if (!overflow) {
-              overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
-                                                  preg27, zero);
-              if (!overflow) {
-                overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
-                                                    preg31, zero);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  return overflow;
-}
-
 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
   if (sizeof(tran_low_t) == 4) {
     const __m128i zero = _mm_setzero_si128();
@@ -259,102 +148,6 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
   }
 }
 
-static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
-                                       const __m128i *pmultiplier,
-                                       const __m128i *prounding, int shift) {
-  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
-  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
-  const __m128i v0 = _mm_add_epi32(u0, *prounding);
-  const __m128i v1 = _mm_add_epi32(u1, *prounding);
-  const __m128i w0 = _mm_srai_epi32(v0, shift);
-  const __m128i w1 = _mm_srai_epi32(v1, shift);
-  return _mm_packs_epi32(w0, w1);
-}
-
-static INLINE void transpose_and_output8x8(
-    const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
-    const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
-    const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr,
-    tran_low_t *out1_ptr) {
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  // 40 41 42 43 44 45 46 47
-  // 50 51 52 53 54 55 56 57
-  // 60 61 62 63 64 65 66 67
-  // 70 71 72 73 74 75 76 77
-  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 54 54 55 55 56 56 57 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 21 36
-  // 44 54 64 74 45 55 61 76
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-  if (pass == 0) {
-    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
-  } else {
-    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
-    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
-    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
-    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
-    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
-    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
-    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
-    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
-  }
-}
-
-void fdct32_8col(__m128i *in0, __m128i *in1);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 8fa1c04d0..c1fb259a1 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -13,10 +13,6 @@
 
 %include "third_party/x86inc/x86inc.asm"
 
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
 SECTION_RODATA
 
 pw_11585x2: times 8 dw 23170
@@ -32,106 +28,7 @@ TRANSFORM_COEFFS 15137,   6270
 TRANSFORM_COEFFS 16069,   3196
 TRANSFORM_COEFFS  9102,  13623
 
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
-
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-%if %1 == 0
-  SUM_SUB            0,  1,  9
-%endif
-
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-%if %1 == 0
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-%else
-  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
-  SWAP               0,  1
-%endif
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
-%endmacro
-
 %macro STORE_OUTPUT 2 ; index, result
-%if CONFIG_HIGHBITDEPTH
   ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
   ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
   ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
@@ -144,16 +41,16 @@ SECTION .text
   punpckhwd          m12, m11
   mova               [outputq + 4*%1 +  0], m%2
   mova               [outputq + 4*%1 + 16], m12
-%else
-  mova               [outputq + 2*%1], m%2
-%endif
 %endmacro
 
+SECTION .text
+
+%if ARCH_X86_64
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride
 
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
+  mova               m8, [GLOBAL(pd_8192)]
+  mova              m12, [GLOBAL(pw_11585x2)]
 
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
@@ -180,25 +77,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
   psllw              m7, 2
 
   ; column transform
-  FDCT8_1D  0
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
-
-  STORE_OUTPUT       0, 0
-  STORE_OUTPUT       8, 1
-  STORE_OUTPUT      16, 2
-  STORE_OUTPUT      24, 3
-  STORE_OUTPUT      32, 4
-  STORE_OUTPUT      40, 5
-  STORE_OUTPUT      48, 6
-  STORE_OUTPUT      56, 7
+  ; stage 1
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  paddw m9, m1, m6
+  psubw m1, m6
+
+  paddw m7, m2, m5
+  psubw m2, m5
+
+  paddw m6, m3, m4
+  psubw m3, m4
+
+  ; stage 2
+  paddw m5, m9, m7
+  psubw m9, m7
+
+  paddw m4, m10, m6
+  psubw m10, m6
+
+  paddw m7, m1, m2
+  psubw m1, m2
+
+  ; stage 3
+  paddw m6, m4, m5
+  psubw m4, m5
+
+  pmulhrsw m1, m12
+  pmulhrsw m7, m12
+
+  ; sin(pi / 8), cos(pi / 8)
+  punpcklwd m2, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+  paddd m5, m8
+  paddd m2, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m2, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m5, m9
+  packssdw m2, m10
+
+  pmulhrsw m6, m12
+  pmulhrsw m4, m12
+
+  paddw m9, m3, m1
+  psubw m3, m1
+
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  ; stage 4
+  ; sin(pi / 16), cos(pi / 16)
+  punpcklwd m1, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m1, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m7, 14
+  psrad m1, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m7, m9
+  packssdw m1, m10
+
+  ; sin(3 * pi / 16), cos(3 * pi / 16)
+  punpcklwd m11, m0, m3
+  punpckhwd m0, m3
+  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+  paddd m9, m8
+  paddd m11, m8
+  paddd m3, m8
+  paddd m0, m8
+  psrad m9, 14
+  psrad m11, 14
+  psrad m3, 14
+  psrad m0, 14
+  packssdw m9, m3
+  packssdw m11, m0
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m6, m7
+  punpcklwd m3, m5, m11
+  punpckhwd m6, m7
+  punpckhwd m5, m11
+  punpcklwd m7, m4, m9
+  punpcklwd m10, m2, m1
+  punpckhwd m4, m9
+  punpckhwd m2, m1
+
+  ; stage 2
+  punpckldq m9, m0, m3
+  punpckldq m1, m6, m5
+  punpckhdq m0, m3
+  punpckhdq m6, m5
+  punpckldq m3, m7, m10
+  punpckldq m5, m4, m2
+  punpckhdq m7, m10
+  punpckhdq m4, m2
+
+  ; stage 3
+  punpcklqdq m10, m9, m3
+  punpckhqdq m9, m3
+  punpcklqdq m2, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m3, m1, m5
+  punpckhqdq m1, m5
+  punpcklqdq m7, m6, m4
+  punpckhqdq m6, m4
+
+  ; row transform
+  ; stage 1
+  paddw m5, m10, m6
+  psubw m10, m6
+
+  paddw m4, m9, m7
+  psubw m9, m7
+
+  paddw m6, m2, m1
+  psubw m2, m1
+
+  paddw m7, m0, m3
+  psubw m0, m3
+
+  ;stage 2
+  paddw m1, m5, m7
+  psubw m5, m7
+
+  paddw m3, m4, m6
+  psubw m4, m6
+
+  paddw m7, m9, m2
+  psubw m9, m2
+
+  ; stage 3
+  punpcklwd m6, m1, m3
+  punpckhwd m1, m3
+  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+  paddd m2, m8
+  paddd m6, m8
+  paddd m3, m8
+  paddd m1, m8
+  psrad m2, 14
+  psrad m6, 14
+  psrad m3, 14
+  psrad m1, 14
+  packssdw m2, m3
+  packssdw m6, m1
+
+  pmulhrsw m7, m12
+  pmulhrsw m9, m12
+
+  punpcklwd m3, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+  paddd m1, m8
+  paddd m3, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m1, 14
+  psrad m3, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m1, m4
+  packssdw m3, m5
+
+  paddw m4, m0, m9
+  psubw m0, m9
+
+  paddw m5, m10, m7
+  psubw m10, m7
+
+  ; stage 4
+  punpcklwd m9, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m9, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m7, 14
+  psrad m9, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m7, m4
+  packssdw m9, m5
+
+  punpcklwd m4, m10, m0
+  punpckhwd m10, m0
+  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+  paddd m5, m8
+  paddd m4, m8
+  paddd m0, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m4, 14
+  psrad m0, 14
+  psrad m10, 14
+  packssdw m5, m0
+  packssdw m4, m10
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m2, m7
+  punpcklwd m10, m1, m4
+  punpckhwd m2, m7
+  punpckhwd m1, m4
+  punpcklwd m7, m6, m5
+  punpcklwd m4, m3, m9
+  punpckhwd m6, m5
+  punpckhwd m3, m9
+
+  ; stage 2
+  punpckldq m5, m0, m10
+  punpckldq m9, m2, m1
+  punpckhdq m0, m10
+  punpckhdq m2, m1
+  punpckldq m10, m7, m4
+  punpckldq m1, m6, m3
+  punpckhdq m7, m4
+  punpckhdq m6, m3
+
+  ; stage 3
+  punpcklqdq m4, m5, m10
+  punpckhqdq m5, m10
+  punpcklqdq m3, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m10, m9, m1
+  punpckhqdq m9, m1
+  punpcklqdq m7, m2, m6
+  punpckhqdq m2, m6
+
+  psraw m1, m4, 15
+  psraw m6, m5, 15
+  psraw m8, m3, 15
+  psraw m11, m0, 15
+
+  psubw m4, m1
+  psubw m5, m6
+  psubw m3, m8
+  psubw m0, m11
+
+  psraw m4, 1
+  psraw m5, 1
+  psraw m3, 1
+  psraw m0, 1
+
+  psraw m1, m10, 15
+  psraw m6, m9, 15
+  psraw m8, m7, 15
+  psraw m11, m2, 15
+
+  psubw m10, m1
+  psubw m9, m6
+  psubw m7, m8
+  psubw m2, m11
+
+  psraw m10, 1
+  psraw m9, 1
+  psraw m7, 1
+  psraw m2, 1
+
+  STORE_OUTPUT  0,  4
+  STORE_OUTPUT  8,  5
+  STORE_OUTPUT 16,  3
+  STORE_OUTPUT 24,  0
+  STORE_OUTPUT 32, 10
+  STORE_OUTPUT 40,  9
+  STORE_OUTPUT 48,  7
+  STORE_OUTPUT 56,  2
 
   RET
 %endif
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
index 60446b086..99f17ebdf 100644
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
@@ -13,6 +13,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
 ;                                            int ref_stride,
 ;                                            unsigned char *src,
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
index a99c0b40e..2a018c1cf 100644
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
@@ -11,8 +11,9 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
index 133640eb7..e5e3238d5 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -11,8 +11,11 @@
 #include <immintrin.h>
 #include <string.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
 
 // -----------------------------------------------------------------------------
 // Copy and average
@@ -100,103 +103,258 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int filter_x_stride,
-                                  const int16_t *filter_y, int filter_y_stride,
-                                  int width, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,  // AVX2 vertical-only filter over 16-bit pixels; writes results clamped to [0, max pixel] into dst.
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params, int bd) {
+  int i, j;  // i: output row (steps by 2), j: output column (steps by 8)
+  const int fo_vert = filter_params_y->taps / 2 - 1;  // number of rows above the output row that the filter window covers
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;  // top row of the first filter window
+  (void)filter_params_x;  // horizontal parameters are unused in this vertical-only path
+  (void)subpel_x_q4;
+  (void)conv_params;  // only read by the asserts below
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m256i s[8], coeffs_y[4];  // s[0..3] feed the low 4 columns of a strip, s[4..7] the high 4 columns
+
+  const int bits = FILTER_BITS;  // right shift applied to every filter sum
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);  // shift count in the form _mm256_sra_epi32 expects
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);  // half of (1 << bits), added before the shift
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // upper clamp: 1023 for bd 10, 4095 for bd 12, otherwise 255
+  const __m256i zero = _mm256_setzero_si256();  // lower clamp
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);  // helper defined outside this view; presumably fills coeffs_y for this subpel phase -- confirm
+
+  for (j = 0; j < w; j += 8) {  // one 8-column strip per iteration
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m256i src6;  // row 6 of the current window; refreshed in the row loop so it is not reloaded
+      __m256i s01 = _mm256_permute2x128_si256(  // 0x20 puts the first operand's low lane in the low half and the second operand's low lane in the high half: rows 0 | 1
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          0x20);
+      __m256i s12 = _mm256_permute2x128_si256(  // rows 1 | 2
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          0x20);
+      __m256i s23 = _mm256_permute2x128_si256(  // rows 2 | 3
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          0x20);
+      __m256i s34 = _mm256_permute2x128_si256(  // rows 3 | 4
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          0x20);
+      __m256i s45 = _mm256_permute2x128_si256(  // rows 4 | 5
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          0x20);
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+      __m256i s56 = _mm256_permute2x128_si256(  // rows 5 | 6
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi16(s01, s12);  // interleave vertically adjacent pixels, columns 0..3 of the strip
+      s[1] = _mm256_unpacklo_epi16(s23, s34);
+      s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+      s[4] = _mm256_unpackhi_epi16(s01, s12);  // same interleave for columns 4..7 of the strip
+      s[5] = _mm256_unpackhi_epi16(s23, s34);
+      s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+      for (i = 0; i < h; i += 2) {  // two output rows per iteration: low lane -> row i, high lane -> row i + 1
+        data = &src_ptr[i * src_stride + j];
+
+        const __m256i s67 = _mm256_permute2x128_si256(  // rows 6 | 7 relative to data
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(  // row 8 here becomes row 6 of the next iteration's window
+            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+        const __m256i s78 = _mm256_permute2x128_si256(  // rows 7 | 8 relative to data
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            src6, 0x20);
+
+        s[3] = _mm256_unpacklo_epi16(s67, s78);
+        s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+        const __m256i res_a = convolve(s, coeffs_y);  // columns 0..3; convolve() is defined outside this view
+
+        __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {  // more than 4 columns remain: compute columns 4..7 too and store 8 pixels per row
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);  // signed-saturating narrow to 16 bits
+          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);  // clamp to [0, clip_pixel]
+          res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_16bit));
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_16bit, 1));
+        } else if (w == 4) {  // 4-wide block: store 64 bits (4 pixels) per row
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_a_round));
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_a_round, 1));
+        } else {  // remaining narrow case; xx_storel_32 is defined outside this view, presumably a 32-bit (2-pixel) store -- confirm
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res_a_round));
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                       _mm256_extracti128_si256(res_a_round, 1));
+        }
+
+        s[0] = s[1];  // slide the window down two rows; s[3] and s[7] are rebuilt next iteration
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
-      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
-      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
-      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
-      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    __m256i p0, p1, u0, u1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1, u0, u1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
-
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
-                          _mm256_avg_epu16(p1, u1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1, u0, u1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm_loadu_si128((const __m128i *)dst);
-      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
-
-      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
-      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1, u0, u1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm_loadl_epi64((const __m128i *)dst);
-      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
-
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
-      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,  // AVX2 horizontal-only filter over 16-bit pixels; writes results clamped to [0, max pixel] into dst.
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params, int bd) {
+  int i, j;  // i: output row (steps by 2), j: output column (steps by 8)
+  const int fo_horiz = filter_params_x->taps / 2 - 1;  // number of pixels left of the output pixel that the filter window covers
+  const uint16_t *const src_ptr = src - fo_horiz;  // leftmost pixel of the first filter window
+  (void)subpel_y_q4;  // vertical parameters are unused in this horizontal-only path
+  (void)filter_params_y;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m256i s[4], coeffs_x[4];  // s[] is filled twice per row pair: once for even outputs, once for odd outputs
+
+  const __m256i round_const_x =
+      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));  // rounding offset for the first shift (by round_0)
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;  // second shift, so both stages together shift by FILTER_BITS
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);  // rounding offset for the second shift
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // upper clamp: 1023 for bd 10, 4095 for bd 12, otherwise 255
+  const __m256i zero = _mm256_setzero_si256();  // lower clamp
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);  // helper defined outside this view; presumably fills coeffs_x for this subpel phase -- confirm
+
+  for (j = 0; j < w; j += 8) {  // one 8-column strip per iteration
+    /* Horizontal filter */
+    for (i = 0; i < h; i += 2) {  // two rows per iteration: low lane -> row i, high lane -> row i + 1
+      const __m256i row0 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);  // 16 source pixels from row i
+      __m256i row1 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);  // 16 source pixels from row i + 1
+
+      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);  // pixels 0..7 of row i | row i + 1
+      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);  // pixels 8..15 of row i | row i + 1
+
+      // even pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 0);  // byte shifts 0/4/8/12 are pixel offsets 0/2/4/6 within each lane
+      s[1] = _mm256_alignr_epi8(r1, r0, 4);
+      s[2] = _mm256_alignr_epi8(r1, r0, 8);
+      s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+      __m256i res_even = convolve(s, coeffs_x);  // convolve() is defined outside this view
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                  round_shift_x);
+
+      // odd pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 2);  // byte shifts 2/6/10/14 are pixel offsets 1/3/5/7 within each lane
+      s[1] = _mm256_alignr_epi8(r1, r0, 6);
+      s[2] = _mm256_alignr_epi8(r1, r0, 10);
+      s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+      __m256i res_odd = convolve(s, coeffs_x);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                 round_shift_x);
+
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),  // second rounding stage (shift by bits)
+                                  round_shift_bits);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+                                 round_shift_bits);
+
+      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);  // signed-saturating narrow to 16 bits
+      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);  // interleave even/odd results back into pixel order
+      res = _mm256_min_epi16(res, clip_pixel);  // clamp to [0, clip_pixel]
+      res = _mm256_max_epi16(res, zero);
+
+      if (w - j > 4) {  // more than 4 columns remain: store 8 pixels per row
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         _mm256_extracti128_si256(res, 1));
+      } else if (w == 4) {  // 4-wide block: store 64 bits (4 pixels) per row
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         _mm256_extracti128_si256(res, 1));
+      } else {  // remaining narrow case; xx_storel_32 is defined outside this view, presumably a 32-bit (2-pixel) store -- confirm
+        xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                     _mm256_castsi256_si128(res));
+        xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                     _mm256_extracti128_si256(res, 1));
+      }
+    }
   }
 }
 
+#define CONV8_ROUNDING_BITS (7)
+
 // -----------------------------------------------------------------------------
 // Horizontal and vertical filtering
 
-#define CONV8_ROUNDING_BITS (7)
-
 static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                               7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                               4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
@@ -817,250 +975,6 @@ static void aom_highbd_filter_block1d8_v2_avx2(
   } while (height > 0);
 }
 
-// Calculation with averaging the input pixels
-
-static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
-                                        uint16_t *dst) {
-  const __m128i a0 = _mm256_castsi256_si128(*y0);
-  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
-  __m128i res = _mm_packus_epi32(a0, a1);
-  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
-  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
-  res = _mm_avg_epu16(res, pix);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                        const __m256i *mask, uint16_t *dst,
-                                        ptrdiff_t pitch) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
-  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
-  const __m256i pix =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
-  a = _mm256_min_epi16(a, *mask);
-  a = _mm256_avg_epu16(a, pix);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
-  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
-}
-
-static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                         const __m256i *mask, uint16_t *dst) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
-  a = _mm256_min_epi16(a, *mask);
-  a = _mm256_avg_epu16(a, pix);
-  _mm256_storeu_si256((__m256i *)dst, a);
-}
-
-static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                         const __m256i *mask, uint16_t *dst,
-                                         ptrdiff_t pitch) {
-  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
-  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
-  __m256i p = _mm256_min_epi16(*y0, *mask);
-  p = _mm256_avg_epu16(p, pix0);
-  _mm256_storeu_si256((__m256i *)dst, p);
-
-  p = _mm256_min_epi16(*y1, *mask);
-  p = _mm256_avg_epu16(p, pix1);
-  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
-}
-
-static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
-                                               const __m128i *y1,
-                                               const __m128i *mask,
-                                               uint16_t *dst) {
-  __m128i res = _mm_packus_epi32(*y0, *y1);
-  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
-  res = _mm_min_epi16(res, *mask);
-  res = _mm_avg_epu16(res, pix);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static void aom_highbd_filter_block1d8_h8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_pixels(src_ptr, src_pitch, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    store_8x1_avg_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_v8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[9], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_8x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_8x9_pixels(src_ptr, src_pitch, signal);
-
-    filter_8x9_pixels(signal, ff, &res0, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d16_v8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[17], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_16x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_16x9_pixels(src_ptr, src_pitch, signal);
-    filter_16x9_pixels(signal, ff, &res0, &res1);
-    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_16x9_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_h2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_2t_pixels(src_ptr, signal);
-    filter_8x1_2t_pixels(signal, &ff, &res0);
-    store_8x1_avg_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_2t_pixels(src_ptr, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d16_v2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[3], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-  __m256i ff;
-
-  pack_2t_filter(filter, &ff);
-  pack_16x2_init(src_ptr, signal);
-
-  do {
-    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_v2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m128i signal[3], res0, res1;
-  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
-  __m128i ff;
-
-  pack_8x1_2t_filter(filter, &ff);
-  pack_8x2_init(src_ptr, signal);
-
-  do {
-    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
-    filter_8_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
 void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                         ptrdiff_t, uint32_t, const int16_t *,
                                         int);
@@ -1080,32 +994,5 @@ void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
 
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
-
-void aom_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-#define aom_highbd_filter_block1d4_h8_avg_avx2 \
-  aom_highbd_filter_block1d4_h8_avg_sse2
-#define aom_highbd_filter_block1d4_h2_avg_avx2 \
-  aom_highbd_filter_block1d4_h2_avg_sse2
-#define aom_highbd_filter_block1d4_v8_avg_avx2 \
-  aom_highbd_filter_block1d4_v8_avg_sse2
-#define aom_highbd_filter_block1d4_v2_avg_avx2 \
-  aom_highbd_filter_block1d4_v2_avg_sse2
-
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
 
 #undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 000000000..f7ac9b496
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;  // i: output row, j: output column
+  const int fo_vert = filter_params_y->taps / 2 - 1;  // rows read above src
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  (void)filter_params_x;  // vertical-only pass: x arguments unused
+  (void)subpel_x_q4;
+  (void)conv_params;  // only read by the asserts below
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m128i s[16], coeffs_y[4];  // s[0..7]: row i inputs, s[8..15]: row i + 1
+
+  const int bits = FILTER_BITS;  // rounding shift applied to every sum
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =  // max pixel value: 255 unless bd is 10 or 12
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {  // one strip of up to 8 columns per pass
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter: preload 7 rows, then add 2 rows per iteration */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+      // Interleave adjacent rows: unpacklo -> columns 0-3, unpackhi -> 4-7.
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);  // +8: row pairs for row i + 1
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {  // two output rows per pass
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);  // row i, cols 0-3
+        __m128i res_a_round0 = _mm_sra_epi32(
+            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+        const __m128i res_a1 = convolve(s + 8, coeffs_y);  // row i + 1
+        __m128i res_a_round1 = _mm_sra_epi32(
+            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {  // more than 4 columns left: write 8 pixels/row
+          const __m128i res_b0 = convolve(s + 4, coeffs_y);  // cols 4-7
+          __m128i res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          __m128i res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_16bit1);
+        } else if (w == 4) {  // 4 pixels: 64-bit store per row
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_a_round1);
+        } else {  // neither case above: 32-bit (2-pixel) store per row
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          *((uint32_t *)(&dst[i * dst_stride + j])) =
+              _mm_cvtsi128_si32(res_a_round0);
+
+          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+              _mm_cvtsi128_si32(res_a_round1);
+        }
+        // Slide every row-pair window down by two source rows.
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+
+        s[0 + 8] = s[1 + 8];
+        s[1 + 8] = s[2 + 8];
+        s[2 + 8] = s[3 + 8];
+
+        s[4 + 8] = s[5 + 8];
+        s[5 + 8] = s[6 + 8];
+        s[6 + 8] = s[7 + 8];
+
+        s6 = s8;  // last row loaded becomes s6 for the next pass
+      }
+    }
+  }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;  // i: output row, j: output column
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;  // start left of src
+  (void)subpel_y_q4;  // horizontal-only pass: y arguments unused
+  (void)filter_params_y;
+
+  // Check that, even with 12-bit input, the horizontally filtered values fit
+  // in 16 bits after the round_0 shift (no intermediate array is used here).
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m128i s[4], coeffs_x[4];
+
+  const __m128i round_const_x =
+      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;  // 2nd-stage shift
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =  // max pixel value: 255 unless bd is 10 or 12
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {  // one strip of up to 8 columns per pass
+    /* Horizontal filter: one row of up to 8 output pixels per iteration */
+    {
+      for (i = 0; i < h; i += 1) {
+        const __m128i row00 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i row01 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+        // even pixels: source shifted by 0, 2, 4, 6 pixels (alignr is in bytes)
+        s[0] = _mm_alignr_epi8(row01, row00, 0);
+        s[1] = _mm_alignr_epi8(row01, row00, 4);
+        s[2] = _mm_alignr_epi8(row01, row00, 8);
+        s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+        __m128i res_even = convolve(s, coeffs_x);
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                 round_shift_x);
+
+        // odd pixels: source shifted by 1, 3, 5, 7 pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 2);
+        s[1] = _mm_alignr_epi8(row01, row00, 6);
+        s[2] = _mm_alignr_epi8(row01, row00, 10);
+        s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+        __m128i res_odd = convolve(s, coeffs_x);
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+                                 round_shift_bits);
+        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+                                round_shift_bits);
+
+        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+        // Even/odd results are interleaved again; clamp to [0, clip_pixel].
+        res = _mm_min_epi16(res, clip_pixel);
+        res = _mm_max_epi16(res, zero);
+
+        if (w - j > 4) {  // more than 4 columns left: 8-pixel store
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        } else if (w == 4) {  // 4 pixels: 64-bit store
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+        } else {  // neither case above: 32-bit (2-pixel) store
+          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
deleted file mode 100644
index e001a1d70..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "aom_ports/msvc.h"
-#include "./aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-// D45E_PRED
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
-                                 const __m256i *z) {
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i a = _mm256_avg_epu16(*x, *z);
-  const __m256i b =
-      _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
-  return _mm256_avg_epu16(b, *y);
-}
-
-static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
-                            const __m256i *a2, uint16_t **dst,
-                            ptrdiff_t stride) {
-  const __m256i y = avg3_epu16(a0, a1, a2);
-  _mm256_storeu_si256((__m256i *)*dst, y);
-  *dst += stride;
-}
-
-void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 9);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
-  x0 = _mm256_insert_epi16(x0, above[23], 15);
-  const __m256i y = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 15);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
-  d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  x2 = _mm256_insert_epi16(x2, above[31], 15);
-  const __m256i y = avg3_epu16(&x0, &x1, &x2);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 33);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
-  x0 = _mm256_insert_epi16(x0, above[47], 15);
-  const __m256i y = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
-
-  uint16_t *dst1 = dst;
-  uint16_t *dst2 = dst + 16;
-
-  d45e_w16(&x0, &x1, &x2, &dst1, stride);
-  d45e_w16(&y0, &y1, &y2, &dst2, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x1, &x2, &x0, &dst1, stride);
-    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x2, &x0, &x1, &dst1, stride);
-    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x0, &x1, &x2, &dst1, stride);
-    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y0, &y1, &y2, &dst2, stride);
-  } while (i < 15);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
-  d45e_w16(&x1, &x2, &x0, &dst1, stride);
-  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
-  d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  d45e_w16(&x2, &x0, &x1, &dst1, stride);
-  y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
-  d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i u = avg3_epu16(&x0, &x1, &x2);
-  _mm256_storeu_si256((__m256i *)dst1, u);
-
-  y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
-  y2 = _mm256_insert_epi16(y2, above[47], 15);
-  u = avg3_epu16(&y0, &y1, &y2);
-  _mm256_storeu_si256((__m256i *)dst2, u);
-}
-
-void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
-
-  uint16_t *dst1 = dst;
-  uint16_t *dst2 = dst + 16;
-
-  d45e_w16(&x0, &x1, &x2, &dst1, stride);
-  d45e_w16(&y0, &y1, &y2, &dst2, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x1, &x2, &x0, &dst1, stride);
-    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x2, &x0, &x1, &dst1, stride);
-    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x0, &x1, &x2, &dst1, stride);
-    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y0, &y1, &y2, &dst2, stride);
-  } while (i < 33);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
-  __m256i u = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst1, u);
-
-  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
-  y0 = _mm256_insert_epi16(y0, above[63], 15);
-  u = avg3_epu16(&y1, &y2, &y0);
-  _mm256_storeu_si256((__m256i *)dst2, u);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
index 691e166cf..5a55736c4 100644
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -11,7 +11,7 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 // -----------------------------------------------------------------------------
 // H_PRED
@@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
     dst += stride;
   }
 }
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
-                                 const __m128i *z) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a = _mm_avg_epu16(*x, *z);
-  const __m128i b =
-      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
-  return _mm_avg_epu16(b, *y);
-}
-
-void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
-  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
-  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
-  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
-  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
-  const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
-  const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
-  const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
-  const __m128i row0 = _mm_srli_si128(avg2, 6);
-  const __m128i row1 = _mm_srli_si128(avg3, 4);
-  const __m128i row2 = _mm_srli_si128(avg2, 4);
-  const __m128i row3 = _mm_srli_si128(avg3, 2);
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-
-  dst -= stride;
-  dst[0] = _mm_extract_epi16(avg3, 1);
-  dst[stride] = _mm_extract_epi16(avg3, 0);
-}
-
-void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
-  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
-  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
-  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
-  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
-  const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
-  const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
-  const __m128i row0 = _mm_srli_si128(avg3, 6);
-  const __m128i row1 = _mm_srli_si128(avg3, 4);
-  const __m128i row2 = _mm_srli_si128(avg3, 2);
-  const __m128i row3 = avg3;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));
-  const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
-  const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
-  const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
-  const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
-  const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
-  const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
-  const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
-  const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
-  const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
-  const __m128i row2 = _mm_srli_si128(row3, 4);
-  const __m128i row1 = _mm_srli_si128(row3, 8);
-  const __m128i row0 = _mm_srli_si128(avg3, 4);
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst[0] = _mm_extract_epi16(avg2, 3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
-  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
-  __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
-  CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);
-  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
-  (void)left;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
-}
-
-void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i h76543210 = _mm_load_si128((const __m128i *)above);
-  __m128i hx7654321 = _mm_srli_si128(h76543210, 2);
-  __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);
-  __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
-  __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);
-  __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));
-  dst += stride;
-
-  // hcba98765
-  h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));
-  h76543210 = _mm_insert_epi16(h76543210, above[11], 7);
-  // hxcba9876
-  hx7654321 = _mm_srli_si128(h76543210, 2);
-  // hxxcba987
-  hx8765432 = _mm_srli_si128(h76543210, 4);
-  avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-}
-
-void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-  __m128i y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 3));
-  y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x1 = _mm_loadu_si128((const __m128i *)(above + 4));
-  y = avg3_epu16(&x2, &x0, &x1);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x2 = _mm_loadu_si128((const __m128i *)(above + 5));
-  x2 = _mm_insert_epi16(x2, above[11], 7);
-  y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-}
-
-static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
-                           const __m128i *a2, uint16_t **dst,
-                           ptrdiff_t stride) {
-  const __m128i y = avg3_epu16(a0, a1, a2);
-  _mm_storeu_si128((__m128i *)*dst, y);
-  *dst += stride;
-}
-
-void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
-  d45e_w8(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x0, &x1, &x2, &dst, stride);
-  } while (i < 9);
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 9));
-  x0 = _mm_insert_epi16(x0, above[15], 7);
-  const __m128i y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
-  d45e_w8(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x0, &x1, &x2, &dst, stride);
-  } while (i < 15);
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 15));
-  __m128i y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x1 = _mm_loadu_si128((const __m128i *)(above + 16));
-  y = avg3_epu16(&x2, &x0, &x1);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x2 = _mm_loadu_si128((const __m128i *)(above + 17));
-  x2 = _mm_insert_epi16(x2, above[23], 7);
-  y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
deleted file mode 100644
index b089a3f43..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
-                                 const __m128i *z) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a = _mm_avg_epu16(*x, *z);
-  const __m128i b =
-      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
-  return _mm_avg_epu16(b, *y);
-}
-
-DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
-  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
-};
-
-static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
-  *a = _mm_shuffle_epi8(*a, *rotrw);
-  return *a;
-}
-
-void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i IXABCDEF =
-      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
-  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
-  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
-  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
-  __m128i rowa = avg2;
-  __m128i rowb = avg3;
-  int i;
-  (void)bd;
-  for (i = 0; i < 8; i += 2) {
-    _mm_store_si128((__m128i *)dst, rowa);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, rowb);
-    dst += stride;
-    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
-    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
-  }
-}
-
-void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A0 = _mm_load_si128((const __m128i *)above);
-  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
-  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
-  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
-  const __m128i L1_ = _mm_srli_si128(L1, 2);
-  __m128i rowa_0 = avg2_0;
-  __m128i rowa_1 = avg2_1;
-  __m128i rowb_0 = avg3_0;
-  __m128i rowb_1 = avg3_1;
-  __m128i avg3_left[2];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
-  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
-  for (i = 0; i < 2; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; j += 2) {
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      dst += stride;
-      _mm_store_si128((__m128i *)dst, rowb_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
-      dst += stride;
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
-      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
-    }
-  }
-}
-
-void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_load_si128((const __m128i *)above);
-  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
-  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
-  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
-  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
-  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
-  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
-  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
-  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
-  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
-  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
-  const __m128i L3_ = _mm_srli_si128(L3, 2);
-  __m128i rowa_0 = avg2_0;
-  __m128i rowa_1 = avg2_1;
-  __m128i rowa_2 = avg2_2;
-  __m128i rowa_3 = avg2_3;
-  __m128i rowb_0 = avg3_0;
-  __m128i rowb_1 = avg3_1;
-  __m128i rowb_2 = avg3_2;
-  __m128i rowb_3 = avg3_3;
-  __m128i avg3_left[4];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
-  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
-  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
-  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
-  for (i = 0; i < 4; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; j += 2) {
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
-      dst += stride;
-      _mm_store_si128((__m128i *)dst, rowb_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
-      dst += stride;
-      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
-      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
-      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
-      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
-      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
-    }
-  }
-}
-
-void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
-  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i AXIJKLMN =
-      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
-  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
-  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
-  __m128i rowa = avg3;
-  int i;
-  (void)bd;
-  for (i = 0; i < 8; ++i) {
-    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
-    _mm_store_si128((__m128i *)dst, rowa);
-    dst += stride;
-  }
-}
-
-void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i B0 = _mm_load_si128((const __m128i *)above);
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
-  const __m128i C1 = _mm_srli_si128(B1, 2);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
-  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
-  __m128i rowa_0 = avg3_0;
-  __m128i rowa_1 = avg3_1;
-  __m128i avg3_left[2];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
-  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
-  for (i = 0; i < 2; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; ++j) {
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i B0 = _mm_load_si128((const __m128i *)above);
-  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
-  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
-  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
-  const __m128i C3 = _mm_srli_si128(B3, 2);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
-  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
-  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
-  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
-  __m128i rowa_0 = avg3_0;
-  __m128i rowa_1 = avg3_1;
-  __m128i rowa_2 = avg3_2;
-  __m128i rowa_3 = avg3_3;
-  __m128i avg3_left[4];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
-  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
-  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
-  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
-  for (i = 0; i < 4; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; ++j) {
-      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
-      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
-  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
-  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i AXIJKLMN =
-      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
-  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
-  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
-  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
-  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
-  const __m128i row0 =
-      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
-  const __m128i row1 =
-      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
-  const __m128i row2 =
-      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
-  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
-  const __m128i row4 =
-      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
-  const __m128i row5 =
-      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
-  const __m128i row6 =
-      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
-  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, row0);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row1);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row2);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row3);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row4);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row5);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row6);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row7);
-}
-
-void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
-  const __m128i B1 = _mm_srli_si128(A1, 2);
-  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
-  const __m128i C1 = _mm_srli_si128(A1, 4);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
-  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
-  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
-  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
-  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
-  __m128i row_0 = avg3_0;
-  __m128i row_1 = avg3_1;
-  __m128i avg2_avg3_left[2][2];
-  int i, j;
-  (void)bd;
-
-  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
-
-  for (j = 0; j < 2; ++j) {
-    for (i = 0; i < 2; ++i) {
-      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
-  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
-  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
-  const __m128i B3 = _mm_srli_si128(A3, 2);
-  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
-  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
-  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
-  const __m128i C3 = _mm_srli_si128(A3, 4);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
-  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
-  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
-  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
-  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
-  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
-  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
-  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
-  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
-  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
-  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
-  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
-  __m128i row_0 = avg3_0;
-  __m128i row_1 = avg3_1;
-  __m128i row_2 = avg3_2;
-  __m128i row_3 = avg3_3;
-  __m128i avg2_avg3_left[4][2];
-  int i, j;
-  (void)bd;
-
-  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
-  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
-  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
-  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
-
-  for (j = 0; j < 4; ++j) {
-    for (i = 0; i < 2; ++i) {
-      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
index 94c68885c..c954da94e 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -11,210 +11,26 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/common_avx2.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
 #include "aom/aom_integer.h"
 
-#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
-                             const uint8_t *t, int bd, __m256i *blt,
-                             __m256i *lt, __m256i *thr) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
-  __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *blt = _mm256_slli_epi16(y, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
-  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *lt = _mm256_slli_epi16(y, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
-  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *thr = _mm256_slli_epi16(y, shift);
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
-                                     __m256i *p, __m256i *q) {
-  int i;
-  for (i = 0; i < size; i++) {
-    p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch));
-    q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch));
-  }
-}
-
-static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q,
-                                   const __m256i *t, __m256i *hev) {
-  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0]));
-  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0]));
-  __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm256_subs_epu16(h, *t);
-
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  const __m256i zero = _mm256_setzero_si256();
-  *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q,
-                                      const __m256i *l, const __m256i *bl,
-                                      __m256i *mask) {
-  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0]));
-  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1]));
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff);
-  max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one));
-
-  int i;
-  for (i = 1; i < 4; ++i) {
-    max = _mm256_max_epi16(max,
-                           _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1])));
-    max = _mm256_max_epi16(max,
-                           _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1])));
-  }
-  max = _mm256_subs_epu16(max, *l);
-  *mask = _mm256_cmpeq_epi16(max, zero);  // return ~mask
-}
-
-static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p,
-                                      const __m256i *q, int bd, int start,
-                                      int end, __m256i *flat) {
-  __m256i max = _mm256_setzero_si256();
-  int i;
-  for (i = start; i < end; ++i) {
-    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0])));
-    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0])));
-  }
-
-  __m256i ft;
-  if (bd == 8)
-    ft = _mm256_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4));
-
-  const __m256i zero = _mm256_setzero_si256();
-  *flat = _mm256_cmpeq_epi16(ft, zero);
-}
-
-// Note:
-//  Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p,
-                                     const __m256i *q, __m256i *flat, int bd) {
-  // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
-}
-
-// Note:
-//  access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p,
-                                     const __m256i *q, __m256i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
-}
-
-static INLINE void pixel_clamp(const __m256i *min, const __m256i *max,
-                               __m256i *pixel) {
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(*pixel, *max);
-  clamped = _mm256_andnot_si256(mask, *pixel);
-  mask = _mm256_and_si256(mask, *max);
-  clamped = _mm256_or_si256(mask, clamped);
-
-  mask = _mm256_cmpgt_epi16(clamped, *min);
-  clamped = _mm256_and_si256(mask, clamped);
-  mask = _mm256_andnot_si256(mask, *min);
-  *pixel = _mm256_or_si256(clamped, mask);
-}
-
-static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
-                                  const __m256i *th, int bd, __m256i *ps,
-                                  __m256i *qs) {
-  __m256i t80;
-  if (bd == 8)
-    t80 = _mm256_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm256_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm256_set1_epi16(0x800);
-
-  __m256i ps0 = _mm256_subs_epi16(p[0], t80);
-  __m256i ps1 = _mm256_subs_epi16(p[1], t80);
-  __m256i qs0 = _mm256_subs_epi16(q[0], t80);
-  __m256i qs1 = _mm256_subs_epi16(q[1], t80);
-
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filter = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filter);
-
-  __m256i hev;
-  highbd_hev_mask(p, q, th, &hev);
-  filter = _mm256_and_si256(filter, hev);
-
-  const __m256i x = _mm256_subs_epi16(qs0, ps0);
-  filter = _mm256_adds_epi16(filter, x);
-  filter = _mm256_adds_epi16(filter, x);
-  filter = _mm256_adds_epi16(filter, x);
-  pixel_clamp(&pmin, &pmax, &filter);
-  filter = _mm256_and_si256(filter, *mask);
-
-  const __m256i t3 = _mm256_set1_epi16(3);
-  const __m256i t4 = _mm256_set1_epi16(4);
-
-  __m256i filter1 = _mm256_adds_epi16(filter, t4);
-  __m256i filter2 = _mm256_adds_epi16(filter, t3);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter1 = _mm256_srai_epi16(filter1, 3);
-  filter2 = _mm256_srai_epi16(filter2, 3);
-
-  qs0 = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &qs0);
-  ps0 = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &ps0);
-
-  qs[0] = _mm256_adds_epi16(qs0, t80);
-  ps[0] = _mm256_adds_epi16(ps0, t80);
-
-  filter = _mm256_adds_epi16(filter1, one);
-  filter = _mm256_srai_epi16(filter, 1);
-  filter = _mm256_andnot_si256(hev, filter);
-
-  qs1 = _mm256_subs_epi16(qs1, filter);
-  pixel_clamp(&pmin, &pmax, &qs1);
-  ps1 = _mm256_adds_epi16(ps1, filter);
-  pixel_clamp(&pmin, &pmax, &ps1);
-
-  qs[1] = _mm256_adds_epi16(qs1, t80);
-  ps[1] = _mm256_adds_epi16(ps1, t80);
-}
-#endif  // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
-
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // forwards unchanged to SSE2
+  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+                                         blimit1, limit1, thresh1, bd);
 }
 
-void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blt, const uint8_t *lt,
-                                          const uint8_t *thr, int bd) {
-  aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {  // forwards unchanged to SSE2
+  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+                                       limit1, thresh1, bd);
 }
 
 void aom_highbd_lpf_horizontal_4_dual_avx2(
@@ -248,626 +64,3 @@ void aom_highbd_lpf_vertical_8_dual_avx2(
   aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
                                       limit1, thresh1, bd);
 }
-#else
-void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  __m256i blimit, limit, thresh;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
-
-  __m256i p[8], q[8];
-  load_highbd_pixel(s, 8, pitch, p, q);
-
-  __m256i mask;
-  highbd_filter_mask(p, q, &limit, &blimit, &mask);
-
-  __m256i flat, flat2;
-  const __m256i one = _mm256_set1_epi16(1);
-  highbd_flat_mask4(&one, p, q, &flat, bd);
-  highbd_flat_mask5(&one, p, q, &flat2, bd);
-
-  flat = _mm256_and_si256(flat, mask);
-  flat2 = _mm256_and_si256(flat2, flat);
-
-  __m256i ps[2], qs[2];
-  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
-
-  // flat and wide flat calculations
-  __m256i flat_p[3], flat_q[3];
-  __m256i flat2_p[7], flat2_q[7];
-  {
-    const __m256i eight = _mm256_set1_epi16(8);
-    const __m256i four = _mm256_set1_epi16(4);
-
-    __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]),
-                                     _mm256_add_epi16(p[4], p[3]));
-    __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]),
-                                     _mm256_add_epi16(q[4], q[3]));
-
-    __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1]));
-    sum_p = _mm256_add_epi16(sum_p, sum_lp);
-
-    __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1]));
-    sum_q = _mm256_add_epi16(sum_q, sum_lq);
-    sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q));
-    sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq));
-
-    flat2_p[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4);
-    flat2_q[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4);
-    flat_p[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3);
-    flat_q[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3);
-
-    __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]);
-    __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]);
-    __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]);
-    __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]);
-
-    sum_q = _mm256_sub_epi16(sum_p, p[6]);
-    sum_p = _mm256_sub_epi16(sum_p, q[6]);
-    flat2_p[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4);
-    flat2_q[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4);
-
-    sum_lq = _mm256_sub_epi16(sum_lp, p[2]);
-    sum_lp = _mm256_sub_epi16(sum_lp, q[2]);
-    flat_p[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3);
-    flat_q[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3);
-
-    sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
-    sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
-    sum_p3 = _mm256_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm256_add_epi16(sum_q3, q[3]);
-
-    sum_p = _mm256_sub_epi16(sum_p, q[5]);
-    sum_q = _mm256_sub_epi16(sum_q, p[5]);
-    flat2_p[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4);
-    flat2_q[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4);
-
-    sum_lp = _mm256_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm256_sub_epi16(sum_lq, p[1]);
-    flat_p[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3);
-    flat_q[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3);
-
-    int i;
-    for (i = 3; i < 7; ++i) {
-      sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
-      sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
-      sum_p = _mm256_sub_epi16(sum_p, q[7 - i]);
-      sum_q = _mm256_sub_epi16(sum_q, p[7 - i]);
-      flat2_p[i] = _mm256_srli_epi16(
-          _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4);
-      flat2_q[i] = _mm256_srli_epi16(
-          _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4);
-    }
-  }
-
-  // highbd_filter8
-  p[2] = _mm256_andnot_si256(flat, p[2]);
-  //  p2 remains unchanged if !(flat && mask)
-  flat_p[2] = _mm256_and_si256(flat, flat_p[2]);
-  //  when (flat && mask)
-  p[2] = _mm256_or_si256(p[2], flat_p[2]);  // full list of p2 values
-  q[2] = _mm256_andnot_si256(flat, q[2]);
-  flat_q[2] = _mm256_and_si256(flat, flat_q[2]);
-  q[2] = _mm256_or_si256(q[2], flat_q[2]);  // full list of q2 values
-
-  int i;
-  for (i = 1; i >= 0; i--) {
-    ps[i] = _mm256_andnot_si256(flat, ps[i]);
-    flat_p[i] = _mm256_and_si256(flat, flat_p[i]);
-    p[i] = _mm256_or_si256(ps[i], flat_p[i]);
-    qs[i] = _mm256_andnot_si256(flat, qs[i]);
-    flat_q[i] = _mm256_and_si256(flat, flat_q[i]);
-    q[i] = _mm256_or_si256(qs[i], flat_q[i]);
-  }
-
-  // highbd_filter16
-
-  for (i = 6; i >= 0; i--) {
-    //  p[i] remains unchanged if !(flat2 && flat && mask)
-    p[i] = _mm256_andnot_si256(flat2, p[i]);
-    flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]);
-    //  get values for when (flat2 && flat && mask)
-    p[i] = _mm256_or_si256(p[i], flat2_p[i]);  // full list of p values
-
-    q[i] = _mm256_andnot_si256(flat2, q[i]);
-    flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]);
-    q[i] = _mm256_or_si256(q[i], flat2_q[i]);
-    _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]);
-    _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]);
-  }
-}
-
-static INLINE void highbd_transpose16x16(uint16_t *src, int src_p,
-                                         uint16_t *dst, int dst_p) {
-  __m256i x[16];
-  int i;
-  for (i = 0; i < 16; ++i) {
-    x[i] = _mm256_loadu_si256((const __m256i *)src);
-    src += src_p;
-  }
-  mm256_transpose_16x16(x, x);
-  for (i = 0; i < 16; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, x[i]);
-    dst += dst_p;
-  }
-}
-
-void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
-
-  //  Transpose 16x16
-  highbd_transpose16x16(s - 8, p, t_dst, 16);
-
-  //  Loop filtering
-  aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, bd);
-
-  //  Transpose back
-  highbd_transpose16x16(t_dst, 16, s - 8, p);
-}
-
-static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0,
-                                  const uint8_t *t0, const uint8_t *b1,
-                                  const uint8_t *l1, const uint8_t *t1, int bd,
-                                  __m256i *blt, __m256i *lt, __m256i *thr) {
-  const __m128i z128 = _mm_setzero_si128();
-  const __m128i blimit0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128);
-  const __m128i limit0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128);
-  const __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128);
-  const __m128i blimit1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128);
-  const __m128i limit1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128);
-  const __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128);
-
-  *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1);
-  *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1);
-  *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1);
-
-  int shift = bd - 8;
-  *blt = _mm256_slli_epi16(*blt, shift);
-  *lt = _mm256_slli_epi16(*lt, shift);
-  *thr = _mm256_slli_epi16(*thr, shift);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
-  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
-  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
-  __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p));
-  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
-  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
-
-  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
-  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
-
-  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
-  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
-
-  __m256i blimit, limit, thresh;
-  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit, &limit, &thresh);
-
-  __m256i t80, tff80, tffe0, t1f, t7f;
-  if (bd == 8) {
-    t80 = _mm256_set1_epi16(0x80);
-    tff80 = _mm256_set1_epi16(0xff80);
-    tffe0 = _mm256_set1_epi16(0xffe0);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8);
-  } else if (bd == 10) {
-    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2);
-    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2);
-    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6);
-  } else {  // bd == 12
-    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4);
-    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4);
-    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4);
-  }
-
-  __m256i ps1 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80);
-  __m256i ps0 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80);
-  __m256i qs0 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80);
-  __m256i qs1 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80);
-
-  // filter_mask and hev_mask
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  __m256i hev = _mm256_subs_epu16(flat, thresh);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);
-
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-  __m256i mask =
-      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
-  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  const __m256i one = _mm256_set1_epi16(1);
-  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
-  mask = _mm256_max_epi16(flat, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  // mask |= (abs(q1 - q0) > limit) * -1;
-  __m256i work = _mm256_max_epi16(
-      _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)),
-      _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3)));
-  mask = _mm256_max_epi16(work, mask);
-  work = _mm256_max_epi16(
-      _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)),
-      _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3)));
-  mask = _mm256_max_epi16(work, mask);
-  mask = _mm256_subs_epu16(mask, limit);
-  mask = _mm256_cmpeq_epi16(mask, zero);
-
-  // filter4
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filt = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, hev);
-  __m256i work_a = _mm256_subs_epi16(qs0, ps0);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  pixel_clamp(&pmin, &pmax, &filt);
-
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  filt = _mm256_and_si256(filt, mask);
-
-  const __m256i t4 = _mm256_set1_epi16(4);
-  const __m256i t3 = _mm256_set1_epi16(3);
-
-  __m256i filter1 = _mm256_adds_epi16(filt, t4);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  __m256i filter2 = _mm256_adds_epi16(filt, t3);
-  pixel_clamp(&pmin, &pmax, &filter2);
-
-  // Filter1 >> 3
-  work_a = _mm256_cmpgt_epi16(zero, filter1);  // get the values that are <0
-  filter1 = _mm256_srli_epi16(filter1, 3);
-  work_a = _mm256_and_si256(work_a, tffe0);    // sign bits for the values < 0
-  filter1 = _mm256_and_si256(filter1, t1f);    // clamp the range
-  filter1 = _mm256_or_si256(filter1, work_a);  // reinsert the sign bits
-
-  // Filter2 >> 3
-  work_a = _mm256_cmpgt_epi16(zero, filter2);
-  filter2 = _mm256_srli_epi16(filter2, 3);
-  work_a = _mm256_and_si256(work_a, tffe0);
-  filter2 = _mm256_and_si256(filter2, t1f);
-  filter2 = _mm256_or_si256(filter2, work_a);
-
-  // filt >> 1
-  // equivalent to shifting 0x1f left by bitdepth - 8
-  // and setting new bits to 1
-  filt = _mm256_adds_epi16(filter1, one);
-  work_a = _mm256_cmpgt_epi16(zero, filt);
-  filt = _mm256_srli_epi16(filt, 1);
-  work_a = _mm256_and_si256(work_a, tff80);
-  filt = _mm256_and_si256(filt, t7f);
-  filt = _mm256_or_si256(filt, work_a);
-
-  filt = _mm256_andnot_si256(hev, filt);
-
-  filter1 = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  q0 = _mm256_adds_epi16(filter1, t80);
-
-  filter1 = _mm256_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  q1 = _mm256_adds_epi16(filter1, t80);
-
-  filter2 = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  p0 = _mm256_adds_epi16(filter2, t80);
-
-  filter2 = _mm256_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  p1 = _mm256_adds_epi16(filter2, t80);
-
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-
-  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
-  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
-  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
-  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
-  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
-  __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p));
-
-  __m256i blimit, limit, thresh;
-  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit, &limit, &thresh);
-
-  __m256i t80;
-  if (bd == 8) {
-    t80 = _mm256_set1_epi16(0x80);
-  } else if (bd == 10) {
-    t80 = _mm256_set1_epi16(0x200);
-  } else {  // bd == 12
-    t80 = _mm256_set1_epi16(0x800);
-  }
-
-  __m256i ps1, ps0, qs0, qs1;
-  ps1 = _mm256_subs_epi16(p1, t80);
-  ps0 = _mm256_subs_epi16(p0, t80);
-  qs0 = _mm256_subs_epi16(q0, t80);
-  qs1 = _mm256_subs_epi16(q1, t80);
-
-  // filter_mask and hev_mask
-  __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
-  abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
-
-  abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
-  abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
-  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  __m256i hev = _mm256_subs_epu16(flat, thresh);
-  const __m256i zero = _mm256_set1_epi16(0);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);
-
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-  __m256i mask =
-      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
-  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-
-  const __m256i one = _mm256_set1_epi16(1);
-  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
-  mask = _mm256_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm256_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q1)));
-  mask = _mm256_max_epi16(work, mask);
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q2)));
-  mask = _mm256_max_epi16(work, mask);
-  mask = _mm256_subs_epu16(mask, limit);
-  mask = _mm256_cmpeq_epi16(mask, zero);
-
-  // flat_mask4
-  flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q0)));
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q0)));
-  flat = _mm256_max_epi16(work, flat);
-  flat = _mm256_max_epi16(abs_p1p0, flat);
-  flat = _mm256_max_epi16(abs_q1q0, flat);
-
-  if (bd == 8)
-    flat = _mm256_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4));
-
-  flat = _mm256_cmpeq_epi16(flat, zero);
-  flat = _mm256_and_si256(flat, mask);  // flat & mask
-
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
-  __m256i workp_a, workp_b, workp_shft;
-  workp_a =
-      _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1));
-  const __m256i four = _mm256_set1_epi16(4);
-  workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0);
-  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft);
-
-  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft);
-
-  // lp filter
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filt, filter1, filter2, work_a;
-  filt = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, hev);
-  work_a = _mm256_subs_epi16(qs0, ps0);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, mask);
-
-  const __m256i t4 = _mm256_set1_epi16(4);
-  const __m256i t3 = _mm256_set1_epi16(3);
-
-  filter1 = _mm256_adds_epi16(filt, t4);
-  filter2 = _mm256_adds_epi16(filt, t3);
-
-  // Filter1 >> 3
-  pixel_clamp(&pmin, &pmax, &filter1);
-  filter1 = _mm256_srai_epi16(filter1, 3);
-
-  // Filter2 >> 3
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter2 = _mm256_srai_epi16(filter2, 3);
-
-  // filt >> 1
-  filt = _mm256_adds_epi16(filter1, one);
-  filt = _mm256_srai_epi16(filt, 1);
-  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-  filt = _mm256_andnot_si256(hev, filt);
-
-  work_a = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  q0 = _mm256_loadu_si256((__m256i *)flat_oq0);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q0 = _mm256_and_si256(flat, q0);
-  q0 = _mm256_or_si256(work_a, q0);
-
-  work_a = _mm256_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  q1 = _mm256_loadu_si256((__m256i *)flat_oq1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q1 = _mm256_and_si256(flat, q1);
-  q1 = _mm256_or_si256(work_a, q1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  q2 = _mm256_loadu_si256((__m256i *)flat_oq2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q2 = _mm256_and_si256(flat, q2);
-  q2 = _mm256_or_si256(work_a, q2);
-
-  work_a = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  p0 = _mm256_loadu_si256((__m256i *)flat_op0);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p0 = _mm256_and_si256(flat, p0);
-  p0 = _mm256_or_si256(work_a, p0);
-
-  work_a = _mm256_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  p1 = _mm256_loadu_si256((__m256i *)flat_op1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p1 = _mm256_and_si256(flat, p1);
-  p1 = _mm256_or_si256(work_a, p1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  p2 = _mm256_loadu_si256((__m256i *)flat_op2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p2 = _mm256_and_si256(flat, p2);
-  p2 = _mm256_or_si256(work_a, p2);
-
-  _mm256_storeu_si256((__m256i *)(s - 3 * p), p2);
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-  _mm256_storeu_si256((__m256i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-#endif  // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 0a399edf2..83e0098ba 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -11,29 +11,23 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom_ports/emmintrin_compat.h"
-#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
 
-static INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                               __m128i *pixel) {
-  __m128i clamped, mask;
+#include "aom_dsp/x86/lpf_common_sse2.h"
 
-  mask = _mm_cmpgt_epi16(*pixel, *max);
-  clamped = _mm_andnot_si128(mask, *pixel);
-  mask = _mm_and_si128(mask, *max);
-  clamped = _mm_or_si128(mask, clamped);
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+                                         __m128i *pixel) {
+  *pixel = _mm_min_epi16(*pixel, *max);
+  *pixel = _mm_max_epi16(*pixel, *min);
+}
 
-  mask = _mm_cmpgt_epi16(clamped, *min);
-  clamped = _mm_and_si128(mask, clamped);
-  mask = _mm_andnot_si128(mask, *min);
-  *pixel = _mm_or_si128(clamped, mask);
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
 }
 
 static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
                              const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr) {
+                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
   const int shift = bd - 8;
   const __m128i zero = _mm_setzero_si128();
 
@@ -45,6 +39,36 @@ static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
 
   x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
   *thr = _mm_slli_epi16(x, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void get_limit_dual(
+    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+    __m128i *t80_out) {
+  const int shift = bd - 8;
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i x0 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+  __m128i x1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *blt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *lt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *thr_out = _mm_slli_epi16(x0, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
 }
 
 static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
@@ -55,115 +79,217 @@ static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
     q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
   }
 }
-// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q,
-                                   const __m128i *t, __m128i *hev) {
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1]));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1]));
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *t);
 
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-  const __m128i zero = _mm_setzero_si128();
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
-                                      const __m128i *l, const __m128i *bl,
-                                      __m128i *mask) {
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0]));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1]));
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+                                           const __m128i *l, const __m128i *bl,
+                                           __m128i *mask) {
+  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
 
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_set1_epi16(0xFFFF);
+
   __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
   max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
   max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
 
   int i;
   for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]),
-                                          _mm_subs_epu16(p[i - 1], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]),
-                                          _mm_subs_epu16(q[i - 1], q[i])));
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
   }
   max = _mm_subs_epu16(max, *l);
   *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
 }
 
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
-                                      const __m128i *q, int bd, int start,
-                                      int end, __m128i *flat) {
-  __m128i max = _mm_setzero_si128();
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+                                                 __m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *abs_p1p0, __m128i *l,
+                                                 __m128i *bl, __m128i *t,
+                                                 __m128i *hev, __m128i *mask) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_set1_epi16(0xFFFF);
+  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+  __m128i max, max01, h;
+
+  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
+
+  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+  // mask |= (abs(*p1 - *p0) > limit) * -1;
+  // mask |= (abs(*q1 - *q0) > limit) * -1;
+  h = _mm_subs_epu16(max01, *t);
+
+  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+  // replicate for the further "merged variables" usage
+  *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+  max = _mm_max_epi16(max, max01);
+  int i;
+  for (i = 2; i < x; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  max = _mm_subs_epu16(max, *l);
+  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
+}
+
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+                                      int start, int end, __m128i *flat) {
+  int i;
+  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+                              abs_diff16(pq[start + 1], pq[0]));
+
+  for (i = start + 2; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  __m128i ft;
+  ft = _mm_subs_epu16(max, *th);
+
+  const __m128i zero = _mm_setzero_si128();
+  *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+                                           const __m128i *q, int start, int end,
+                                           __m128i *flat) {
   int i;
-  for (i = start; i < end; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]),
-                                          _mm_subs_epu16(p[0], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]),
-                                          _mm_subs_epu16(q[0], q[i])));
+  __m128i max =
+      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+  for (i = start + 1; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
   }
 
   __m128i ft;
-  if (bd == 8)
-    ft = _mm_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));
+  ft = _mm_subs_epu16(max, *th);
 
   const __m128i zero = _mm_setzero_si128();
   *flat = _mm_cmpeq_epi16(ft, zero);
 }
 
-// Note:
-//  Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+                                          __m128i *flat2, int bd) {
   // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal(&th, pq, 1, 4, flat);
+  flat_mask_internal(&th, pq, 4, 7, flat2);
 }
 
-// Note:
-//  access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+                                               const __m128i *q, __m128i *flat,
+                                               __m128i *flat2, int bd) {
+  // check the distance 1,2,3 against 0
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
 }
 
-static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
-                                  const __m128i *th, int bd, __m128i *ps,
-                                  __m128i *qs) {
-  __m128i t80;
-  if (bd == 8)
-    t80 = _mm_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm_set1_epi16(0x800);
-
-  __m128i ps0 = _mm_subs_epi16(p[0], t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], t80);
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *hev, __m128i *mask,
+                                                 __m128i *qs1qs0,
+                                                 __m128i *ps1ps0, __m128i *t80,
+                                                 int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i pmax =
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+  __m128i ps1ps0_work, qs1qs0_work, work;
+  __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &work);
+  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  // (aom_filter + 3 * (qs0 - ps0)) & mask
+  pixel_clamp(&pmin, &pmax, &filt);
+  filt = _mm_and_si128(filt, *mask);
+  filt = _mm_unpacklo_epi64(filt, filt);
+
+  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+  pixel_clamp(&pmin, &pmax, &filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
 
+  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filt, one);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(*hev, filt);
+
+  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+                                            __m128i *qs, const __m128i *mask,
+                                            const __m128i *th, int bd,
+                                            __m128i *t80) {
+  __m128i ps0 = _mm_subs_epi16(p[0], *t80);
+  __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+  __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+  __m128i qs1 = _mm_subs_epi16(q[1], *t80);
   const __m128i one = _mm_set1_epi16(1);
   const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
 
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
   __m128i filter = _mm_subs_epi16(ps1, qs1);
   pixel_clamp(&pmin, &pmax, &filter);
 
+  // hev_filter
   __m128i hev;
-  highbd_hev_mask(p, q, th, &hev);
+  const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+  const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  h = _mm_subs_epu16(h, *th);
+  const __m128i ffff = _mm_cmpeq_epi16(h, h);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
   filter = _mm_and_si128(filter, hev);
 
   const __m128i x = _mm_subs_epi16(qs0, ps0);
@@ -172,145 +298,332 @@ static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
   filter = _mm_adds_epi16(filter, x);
   pixel_clamp(&pmin, &pmax, &filter);
   filter = _mm_and_si128(filter, *mask);
-
   const __m128i t3 = _mm_set1_epi16(3);
   const __m128i t4 = _mm_set1_epi16(4);
-
   __m128i filter1 = _mm_adds_epi16(filter, t4);
   __m128i filter2 = _mm_adds_epi16(filter, t3);
   pixel_clamp(&pmin, &pmax, &filter1);
   pixel_clamp(&pmin, &pmax, &filter2);
   filter1 = _mm_srai_epi16(filter1, 3);
   filter2 = _mm_srai_epi16(filter2, 3);
-
   qs0 = _mm_subs_epi16(qs0, filter1);
   pixel_clamp(&pmin, &pmax, &qs0);
   ps0 = _mm_adds_epi16(ps0, filter2);
   pixel_clamp(&pmin, &pmax, &ps0);
-
-  qs[0] = _mm_adds_epi16(qs0, t80);
-  ps[0] = _mm_adds_epi16(ps0, t80);
-
+  qs[0] = _mm_adds_epi16(qs0, *t80);
+  ps[0] = _mm_adds_epi16(ps0, *t80);
   filter = _mm_adds_epi16(filter1, one);
   filter = _mm_srai_epi16(filter, 1);
   filter = _mm_andnot_si128(hev, filter);
-
   qs1 = _mm_subs_epi16(qs1, filter);
   pixel_clamp(&pmin, &pmax, &qs1);
   ps1 = _mm_adds_epi16(ps1, filter);
   pixel_clamp(&pmin, &pmax, &ps1);
-
-  qs[1] = _mm_adds_epi16(qs1, t80);
-  ps[1] = _mm_adds_epi16(ps1, t80);
+  qs[1] = _mm_adds_epi16(qs1, *t80);
+  ps[1] = _mm_adds_epi16(ps1, *t80);
 }
 
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput;
-
-static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
-                                                   const uint8_t *blt,
-                                                   const uint8_t *lt,
-                                                   const uint8_t *thr, int bd,
-                                                   PixelOutput pixel_output) {
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+    __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+    const unsigned char *lt, const unsigned char *thr, int bd) {
+  int i;
   __m128i blimit, limit, thresh;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
+  __m128i t80;
+  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+  for (i = 0; i < 7; i++) {
+    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+  }
+  __m128i mask, hevhev;
+  __m128i p1p0, q1q0, abs_p1p0;
 
-  __m128i p[8], q[8];
-  load_highbd_pixel(s, 8, pitch, p, q);
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hevhev, &mask);
 
-  __m128i mask;
-  highbd_filter_mask(p, q, &limit, &blimit, &mask);
+  __m128i ps0ps1, qs0qs1;
+  // filter4
+  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
 
   __m128i flat, flat2;
-  const __m128i one = _mm_set1_epi16(1);
-  highbd_flat_mask4(&one, p, q, &flat, bd);
-  highbd_flat_mask5(&one, p, q, &flat2, bd);
+  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
 
   flat = _mm_and_si128(flat, mask);
   flat2 = _mm_and_si128(flat2, flat);
 
-  __m128i ps[2], qs[2];
-  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+  flat2 = _mm_unpacklo_epi64(flat2, flat2);
 
   // flat and wide flat calculations
-  __m128i flat_p[3], flat_q[3];
-  __m128i flat2_p[7], flat2_q[7];
+  __m128i flat_p[3], flat_q[3], flat_pq[3];
+  __m128i flat2_p[6], flat2_q[6];
+  __m128i flat2_pq[6];
   {
+    __m128i work0;
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+    sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+    __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+    sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+    work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+    flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
+    flat2_q[0] =
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
+
+    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+    __m128i sum_p6, sum_p3;
+    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+    sum_q = _mm_sub_epi16(sum_p, p[5]);
+    sum_p = _mm_sub_epi16(sum_p, q[5]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+    flat2_p[1] = _mm_add_epi16(sum_p, work0);
+    flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+
+    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+    work0 = _mm_add_epi16(sum_p3, pq[1]);
+    flat_p[1] = _mm_add_epi16(sum_lp, work0);
+    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+    flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+    flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+    sum_p = _mm_sub_epi16(sum_p, q[4]);
+    sum_q = _mm_sub_epi16(sum_q, p[4]);
 
-    __m128i sum_p =
-        _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3]));
-    __m128i sum_q =
-        _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3]));
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+    flat2_p[2] = _mm_add_epi16(sum_p, work0);
+    flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
 
+    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+
+    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+    work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+    flat_p[2] = _mm_add_epi16(sum_lp, work0);
+    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[3]);
+    sum_q = _mm_sub_epi16(sum_q, p[3]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+    flat2_p[3] = _mm_add_epi16(sum_p, work0);
+    flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[2]);
+    sum_q = _mm_sub_epi16(sum_q, p[2]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+    flat2_p[4] = _mm_add_epi16(sum_p, work0);
+    flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[1]);
+    sum_q = _mm_sub_epi16(sum_q, p[1]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+    flat2_p[5] = _mm_add_epi16(sum_p, work0);
+    flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+  }
+
+  // highbd_filter8
+  pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+  pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+  for (i = 0; i < 3; i++) {
+    pq[i] = _mm_andnot_si128(flat, pq[i]);
+    flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+    pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+  }
+
+  // highbd_filter16
+  for (i = 5; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    pq[i] = _mm_andnot_si128(flat2, pq[i]);
+    flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+    //  get values for when (flat2 && flat && mask)
+    pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
+  }
+}
+
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blt, const uint8_t *lt,
+                                       const uint8_t *thr, int bd) {
+  __m128i p[7], q[7], pq[7];
+  int i;
+
+  for (i = 0; i < 7; i++) {
+    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+  }
+
+  highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
+
+  for (i = 0; i < 6; i++) {
+    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
+    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
+  }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+    const uint8_t *thr1, int bd) {
+  __m128i blimit, limit, thresh, t80;
+  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+                 &t80);
+  __m128i mask;
+  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+  __m128i flat, flat2;
+  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+  flat = _mm_and_si128(flat, mask);
+  flat2 = _mm_and_si128(flat2, flat);
+  __m128i ps[2], qs[2];
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+  // flat and wide flat calculations
+  __m128i flat_p[3], flat_q[3];
+  __m128i flat2_p[6], flat2_q[6];
+  {
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
     __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
     sum_p = _mm_add_epi16(sum_p, sum_lp);
-
     __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
     sum_q = _mm_add_epi16(sum_q, sum_lq);
     sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
     sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat2_p[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4);
-    flat2_q[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4);
+    flat2_p[0] = _mm_srli_epi16(
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+                                           _mm_add_epi16(p[1], q[0]))),
+        4);
+    flat2_q[0] = _mm_srli_epi16(
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+                                           _mm_add_epi16(p[0], q[1]))),
+        4);
     flat_p[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
     flat_q[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
-
-    __m128i sum_p7 = _mm_add_epi16(p[7], p[7]);
-    __m128i sum_q7 = _mm_add_epi16(q[7], q[7]);
+    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
     __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
     __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
-    sum_q = _mm_sub_epi16(sum_p, p[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[6]);
-    flat2_p[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4);
-    flat2_q[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4);
-
+    sum_q = _mm_sub_epi16(sum_p, p[5]);
+    sum_p = _mm_sub_epi16(sum_p, q[5]);
+    flat2_p[1] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+        4);
+    flat2_q[1] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+        4);
     sum_lq = _mm_sub_epi16(sum_lp, p[2]);
     sum_lp = _mm_sub_epi16(sum_lp, q[2]);
     flat_p[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
     flat_q[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
-    sum_p7 = _mm_add_epi16(sum_p7, p[7]);
-    sum_q7 = _mm_add_epi16(sum_q7, q[7]);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
     sum_p3 = _mm_add_epi16(sum_p3, p[3]);
     sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-
-    sum_p = _mm_sub_epi16(sum_p, q[5]);
-    sum_q = _mm_sub_epi16(sum_q, p[5]);
-    flat2_p[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4);
-    flat2_q[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4);
-
+    sum_p = _mm_sub_epi16(sum_p, q[4]);
+    sum_q = _mm_sub_epi16(sum_q, p[4]);
+    flat2_p[2] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+        4);
+    flat2_q[2] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+        4);
     sum_lp = _mm_sub_epi16(sum_lp, q[1]);
     sum_lq = _mm_sub_epi16(sum_lq, p[1]);
     flat_p[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
     flat_q[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
-    int i;
-    for (i = 3; i < 7; ++i) {
-      sum_p7 = _mm_add_epi16(sum_p7, p[7]);
-      sum_q7 = _mm_add_epi16(sum_q7, q[7]);
-      sum_p = _mm_sub_epi16(sum_p, q[7 - i]);
-      sum_q = _mm_sub_epi16(sum_q, p[7 - i]);
-      flat2_p[i] =
-          _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4);
-      flat2_q[i] =
-          _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4);
-    }
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[3]);
+    sum_q = _mm_sub_epi16(sum_q, p[3]);
+    flat2_p[3] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+        4);
+    flat2_q[3] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+        4);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[2]);
+    sum_q = _mm_sub_epi16(sum_q, p[2]);
+    flat2_p[4] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+        4);
+    flat2_q[4] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+        4);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[1]);
+    sum_q = _mm_sub_epi16(sum_q, p[1]);
+    flat2_p[5] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+        4);
+    flat2_q[5] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+        4);
   }
-
   // highbd_filter8
   p[2] = _mm_andnot_si128(flat, p[2]);
   //  p2 remains unchanged if !(flat && mask)
@@ -320,7 +633,6 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
   q[2] = _mm_andnot_si128(flat, q[2]);
   flat_q[2] = _mm_and_si128(flat, flat_q[2]);
   q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-
   int i;
   for (i = 1; i >= 0; i--) {
     ps[i] = _mm_andnot_si128(flat, ps[i]);
@@ -330,675 +642,979 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
     flat_q[i] = _mm_and_si128(flat, flat_q[i]);
     q[i] = _mm_or_si128(qs[i], flat_q[i]);
   }
-
   // highbd_filter16
-
-  if (pixel_output == FOUR_PIXELS) {
-    for (i = 6; i >= 0; i--) {
-      //  p[i] remains unchanged if !(flat2 && flat && mask)
-      p[i] = _mm_andnot_si128(flat2, p[i]);
-      flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-      //  get values for when (flat2 && flat && mask)
-      p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-
-      q[i] = _mm_andnot_si128(flat2, q[i]);
-      flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-      q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]);
-      _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]);
-    }
-  } else {  // EIGHT_PIXELS
-    for (i = 6; i >= 0; i--) {
-      //  p[i] remains unchanged if !(flat2 && flat && mask)
-      p[i] = _mm_andnot_si128(flat2, p[i]);
-      flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-      //  get values for when (flat2 && flat && mask)
-      p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-
-      q[i] = _mm_andnot_si128(flat2, q[i]);
-      flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-      q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-      _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
-    }
+  for (i = 5; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    p[i] = _mm_andnot_si128(flat2, p[i]);
+    flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+    //  get values for when (flat2 && flat && mask)
+    p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
+    q[i] = _mm_andnot_si128(flat2, q[i]);
+    flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+    q[i] = _mm_or_si128(q[i], flat2_q[i]);
   }
 }
 
-// Note:
-//  highbd_lpf_horz_edge_8_8p() output 8 pixels per register
-//  highbd_lpf_horz_edge_8_4p() output 4 pixels per register
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch,
-                                             const uint8_t *blt,
-                                             const uint8_t *lt,
-                                             const uint8_t *thr, int bd) {
-  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS);
-}
-#endif  // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-
-static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch,
-                                             const uint8_t *blt,
-                                             const uint8_t *lt,
-                                             const uint8_t *thr, int bd) {
-  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS);
-}
-
-void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
-                                           const uint8_t *_blimit,
-                                           const uint8_t *_limit,
-                                           const uint8_t *_thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
-#else
-  highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
-#endif
-}
-
-void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
-#else
-  highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
-  highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd);
-#endif
-}
-
-static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
-                                      const __m128i *p0, const __m128i *q0,
-                                      const __m128i *q1, const __m128i *q2,
-                                      int p, uint16_t *s) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  _mm_storel_epi64((__m128i *)(s - 3 * p), *p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), *p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), *p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), *q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), *q1);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), *q2);
-#else
-  _mm_store_si128((__m128i *)(s - 3 * p), *p2);
-  _mm_store_si128((__m128i *)(s - 2 * p), *p1);
-  _mm_store_si128((__m128i *)(s - 1 * p), *p0);
-  _mm_store_si128((__m128i *)(s + 0 * p), *q0);
-  _mm_store_si128((__m128i *)(s + 1 * p), *q1);
-  _mm_store_si128((__m128i *)(s + 2 * p), *q2);
-#endif
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p[7], q[7];
+  int i;
+  load_highbd_pixel(s, 7, pitch, p, q);
+  // Seven vectors per side are requested; only six per side are stored below.
+  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+                                   _limit1, _thresh1, bd);
+  // storeu: nothing here guarantees that s or pitch is 16-byte aligned.
+  for (i = 0; i < 6; i++) {
+    _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+    _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
+  }
 }
 
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
   __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i workp_a, workp_b, workp_shft;
+  __m128i pq[3];
+  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+  __m128i flat_p1p0, flat_q0q1;
 
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
   __m128i t80;
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_set1_epi16(0x200);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_set1_epi16(0x800);
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 * 2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
   }
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
 
-  ps1 = _mm_subs_epi16(p1, t80);
-  ps0 = _mm_subs_epi16(p0, t80);
-  qs0 = _mm_subs_epi16(q0, t80);
-  qs1 = _mm_subs_epi16(q1, t80);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  // filter_mask and hev_mask
-  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+}
 
-  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+    const unsigned char *_thresh0, const unsigned char *_blimit1,
+    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat, work;
+  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+  __m128i op1, op0, oq0, oq1;
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p2p1 = abs_diff16(*p2, *p1);
+  abs_p1p0 = abs_diff16(*p1, *p0);
+  abs_q1q0 = abs_diff16(*q1, *q0);
+  abs_q2q1 = abs_diff16(*q2, *q1);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
-  mask = _mm_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  mask = _mm_max_epi16(abs_q2q1, mask);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_max_epi16(mask, abs_p2p1);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+  flat = _mm_max_epi16(flat, work);
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 * 2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+    op1 = _mm_srli_epi16(workp_shft0, 3);
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+    op0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+    oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+    oq1 = _mm_srli_epi16(workp_shft1, 3);
+  }
+  // lp filter
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
+
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
+
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
+
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
+
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
+}
+
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+  // Load the low 64 bits (four 16-bit samples) at s - 3 * p .. s + 2 * p.
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  // p2/q2 are filter inputs only; the stores below write p1, p0, q0 and q1.
+  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+                             _blimit, _limit, _thresh, bd);
+  // Low 64 bits of each output go to s - p and s; high 64 bits to s - 2p, s + p.
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2;
+  // Unaligned 128-bit loads (eight 16-bit samples) at s - 3 * p .. s + 2 * p.
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  // The helper writes its results back through the p1, p0, q0 and q1 pointers.
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+  // Only p1, p0, q0 and q1 are stored; p2 and q2 are not written back.
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    const unsigned char *_blimit, const unsigned char *_limit,
+    const unsigned char *_thresh, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i pq[4];
+  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+  __m128i abs_p1p0;
+
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i t80;
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
   // flat_mask4
-  flat = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
-  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
   flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(abs_q1q0, flat);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
 
-  if (bd == 8)
-    flat = _mm_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
 
   flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
 
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // op2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // op0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
+
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
 
-  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+    const unsigned char *_limit0, const unsigned char *_thresh0,
+    const unsigned char *_blimit1, const unsigned char *_limit1,
+    const unsigned char *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat;
+  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
+
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+  work1 =
+      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
+  work0 = _mm_max_epi16(work0, work1);
+  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+  work2 = _mm_max_epi16(work2, work0);
+  mask = _mm_max_epi16(work2, mask);
+
+  mask = _mm_subs_epu16(mask, limit0);
+  mask = _mm_cmpeq_epi16(mask, zero);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+  flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+  flat = _mm_max_epi16(work1, flat);
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+  flat = _mm_max_epi16(work0, flat);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  {
+    __m128i workp_a, workp_b;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // op2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
 
   // lp filter
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
 
-  filt = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
 
-  filt = _mm_and_si128(filt, hev);
-  work_a = _mm_subs_epi16(qs0, ps0);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, mask);
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
 
-  filter1 = _mm_adds_epi16(filt, t4);
-  filter2 = _mm_adds_epi16(filt, t3);
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
 
-  // Filter1 >> 3
-  pixel_clamp(&pmin, &pmax, &filter1);
-  filter1 = _mm_srai_epi16(filter1, 3);
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
 
-  // Filter2 >> 3
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter2 = _mm_srai_epi16(filter2, 3);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
 
-  // filt >> 1
-  filt = _mm_adds_epi16(filter1, t1);
-  filt = _mm_srai_epi16(filt, 1);
-  filt = _mm_andnot_si128(hev, filt);
-
-  work_a = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  q0 = _mm_load_si128((__m128i *)flat_oq0);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q0 = _mm_and_si128(flat, q0);
-  q0 = _mm_or_si128(work_a, q0);
-
-  work_a = _mm_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  q1 = _mm_load_si128((__m128i *)flat_oq1);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q1 = _mm_and_si128(flat, q1);
-  q1 = _mm_or_si128(work_a, q1);
-
-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q2 = _mm_load_si128((__m128i *)flat_oq2);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q2 = _mm_and_si128(flat, q2);
-  q2 = _mm_or_si128(work_a, q2);
-
-  work_a = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  p0 = _mm_load_si128((__m128i *)flat_op0);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p0 = _mm_and_si128(flat, p0);
-  p0 = _mm_or_si128(work_a, p0);
-
-  work_a = _mm_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  p1 = _mm_load_si128((__m128i *)flat_op1);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p1 = _mm_and_si128(flat, p1);
-  p1 = _mm_or_si128(work_a, p1);
-
-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p2 = _mm_load_si128((__m128i *)flat_op2);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p2 = _mm_and_si128(flat, p2);
-  p2 = _mm_or_si128(work_a, p2);
-
-  store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0;
+  // Load the low 64 bits (four 16-bit samples) at s - 4 * p .. s + 3 * p.
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  // p2/q2 are updated in place; p1/p0 and q1/q0 come back packed in p1p0/q1q0.
+  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
+                             &p1p0, _blimit, _limit, _thresh, bd);
+  // p3/q3 are not stored; low halves of p1p0/q1q0 go to s - p and s.
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 }
 
 void aom_highbd_lpf_horizontal_8_dual_sse2(
     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     const uint8_t *_thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+
+  highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
+                                  _blimit0, _limit0, _thresh0, _blimit1,
+                                  _limit1, _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
 }
 
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+    const uint8_t *_thresh, int bd) {
   __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-#endif
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-#endif
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  __m128i mask, hev;
+  __m128i p1p0, q1q0;
+  __m128i pq[2];
+
+  __m128i abs_p1p0;
+
+  __m128i t80;
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i mask, flat;
+  __m128i p[2], q[2];
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i abs_p0q0 = abs_diff16(*q0, *p0);
+  __m128i abs_p1q1 = abs_diff16(*q1, *p1);
+
+  __m128i abs_p1p0 = abs_diff16(*p1, *p0);
+  __m128i abs_q1q0 = abs_diff16(*q1, *q0);
+
   const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
   const __m128i one = _mm_set1_epi16(1);
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
 
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
   __m128i t80;
-  __m128i tff80;
-  __m128i tffe0;
-  __m128i t1f;
-  // equivalent to shifting 0x1f left by bitdepth - 8
-  // and setting new bits to 1
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i t7f;
-  // equivalent to shifting 0x7f left by bitdepth - 8
-  // and setting new bits to 1
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-    tff80 = _mm_set1_epi16(0xff80);
-    tffe0 = _mm_set1_epi16(0xffe0);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
-  }
 
-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
 
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
   mask = _mm_max_epi16(flat, mask);
 
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
-  mask = _mm_max_epi16(work, mask);
-#endif
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
-  // filter4
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
-
-  filt = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, hev);
-  work_a = _mm_subs_epi16(qs0, ps0);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  pixel_clamp(&pmin, &pmax, &filt);
-
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  filt = _mm_and_si128(filt, mask);
-
-  filter1 = _mm_adds_epi16(filt, t4);
-  pixel_clamp(&pmin, &pmax, &filter1);
-
-  filter2 = _mm_adds_epi16(filt, t3);
-  pixel_clamp(&pmin, &pmax, &filter2);
-
-  // Filter1 >> 3
-  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
-  filter1 = _mm_srli_epi16(filter1, 3);
-  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
-  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
-  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+  p[0] = *p0;
+  p[1] = *p1;
+  q[0] = *q0;
+  q[1] = *q1;
 
-  // Filter2 >> 3
-  work_a = _mm_cmpgt_epi16(zero, filter2);
-  filter2 = _mm_srli_epi16(filter2, 3);
-  work_a = _mm_and_si128(work_a, tffe0);
-  filter2 = _mm_and_si128(filter2, t1f);
-  filter2 = _mm_or_si128(filter2, work_a);
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+}
 
-  // filt >> 1
-  filt = _mm_adds_epi16(filter1, t1);
-  work_a = _mm_cmpgt_epi16(zero, filt);
-  filt = _mm_srli_epi16(filt, 1);
-  work_a = _mm_and_si128(work_a, tff80);
-  filt = _mm_and_si128(filt, t7f);
-  filt = _mm_or_si128(filt, work_a);
-
-  filt = _mm_andnot_si128(hev, filt);
-
-  q0 = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &q0);
-  q0 = _mm_adds_epi16(q0, t80);
-
-  q1 = _mm_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &q1);
-  q1 = _mm_adds_epi16(q1, t80);
-
-  p0 = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &p0);
-  p0 = _mm_adds_epi16(p0, t80);
-
-  p1 = _mm_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &p1);
-  p1 = _mm_adds_epi16(p1, t80);
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-#else
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-#endif
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p1p0, q1q0;
+  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
+                             _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 }
 
 void aom_highbd_lpf_horizontal_4_dual_sse2(
     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     const uint8_t *_thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i ps[2], qs[2];
+
+  highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
+                                  _thresh0, _blimit1, _limit1, _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
 }
 
 void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
-  uint16_t *src[1];
-  uint16_t *dst[1];
+  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
+  __m128i p1p0, q1q0;
+  __m128i p1, q1;
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
+                             thresh, bd);
+
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  // transpose from 8x4 to 4x8
+  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
 
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 }
 
 void aom_highbd_lpf_vertical_4_dual_sse2(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i ps[2], qs[2];
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+
+  highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+                               &d2, &d3);
+
+  highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+                                  thresh0, blimit1, limit1, thresh1, bd);
+
+  highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+                               &d3, &d4, &d5, &d6, &d7);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int bd) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x3, x2, x1, x0, p0, q0;
+  __m128i p1p0, q1q0;
+
+  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+
+  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
+
+  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
+                             limit, thresh, bd);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0, p1, q1, p2, q2;
+
+  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+
+  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+                           &p0, &q0, &q1, &q2, &d6, &d7);
+
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
 }
 
 void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
-  uint16_t *src[1];
-  uint16_t *dst[1];
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p2, p1, p0, p3, q0;
+  __m128i q1q0, p1p0;
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
 
   // Loop filtering
-  aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+                             &p1p0, blimit, limit, thresh, bd);
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+                               &d1, &d2, &d3);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
 }
 
 void aom_highbd_lpf_vertical_8_dual_sse2(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+  x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+
+  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+                           &d2, &d3, &d4, &d5, &d6, &d7);
+
+  highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+                                  blimit0, limit0, thresh0, blimit1, limit1,
+                                  thresh1, bd);
+
+  highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+                           &x2, &x3, &x4, &x5, &x6, &x7);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int bd) {
+  __m128i q[7], p[7], pq[7];
+  __m128i p6, p5, p4, p3;
+  __m128i p6_2, p5_2, p4_2, p3_2;
+  __m128i d0, d1, d2, d3;
+  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+                               &p[3], &p[2], &p[1], &p[0]);
 
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 
-void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+                               &q[3], &q[4], &q[5], &q[6], &d7_2);
 
-  src[0] = s - 8;
-  src[1] = s;
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 8;
+  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
 
-  // Transpose 16x8
-  highbd_transpose(src, p, dst, 8, 2);
+  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
-                                        bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8 * 8;
-  dst[0] = s - 8;
-  dst[1] = s;
-
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
-
-  //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
-
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd);
-#else
-  aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, bd);
-#endif
-  //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  q[0] = _mm_srli_si128(pq[0], 8);
+  q[1] = _mm_srli_si128(pq[1], 8);
+  q[2] = _mm_srli_si128(pq[2], 8);
+  q[3] = _mm_srli_si128(pq[3], 8);
+  q[4] = _mm_srli_si128(pq[4], 8);
+  q[5] = _mm_srli_si128(pq[5], 8);
+
+  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  __m128i q[7], p[7];
+  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+  __m128i d0, d7;
+  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+                           &q[6], &d7);
+
+  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+                                   limit1, thresh1, bd);
+
+  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index 2bbf15ef2..dea113a29 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
index 855bc6558..e0d22522d 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -288,11 +288,9 @@ HIGH_SADNXN4D  8,  8
 HIGH_SADNXN4D  8,  4
 HIGH_SADNXN4D  4,  8
 HIGH_SADNXN4D  4,  4
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SADNXN4D  4, 16
 HIGH_SADNXN4D 16,  4
 HIGH_SADNXN4D  8, 32
 HIGH_SADNXN4D 32,  8
 HIGH_SADNXN4D 16, 64
 HIGH_SADNXN4D 64, 16
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
index 760e68aab..3398d8a2a 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -158,10 +158,8 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -230,10 +228,8 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
 HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -302,12 +298,10 @@ HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
 HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
 HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -376,7 +370,5 @@ HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index ee19796e3..61f5b8e86 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -94,7 +94,7 @@ SECTION .text
 %define filter_idx_shift 5
 
 
-%ifdef PIC    ; 64bit PIC
+%if ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                       x_offset, y_offset, \
@@ -102,19 +102,20 @@ SECTION .text
                                       sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
-    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse, \
+                                        g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp
 
@@ -133,8 +134,9 @@ SECTION .text
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                x_offset, y_offset, dst, dst_stride, height, \
-                                sse, g_bilin_filter, g_pw_8
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse, \
+                                    g_bilin_filter, g_pw_8
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
@@ -153,22 +155,16 @@ SECTION .text
     %endif
   %else
     %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse
       %define block_height dword heightm
       %define sec_str sec_stridemp
-      %endif
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                              x_offset, y_offset, dst, dst_stride, height, sse
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse
       %define block_height heightd
     %endif
 
@@ -287,14 +283,14 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -311,7 +307,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -514,14 +510,14 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -538,7 +534,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -636,14 +632,14 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -660,7 +656,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -735,14 +731,14 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -759,7 +755,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -862,8 +858,8 @@ SECTION .text
 
 .x_nonhalf_y_nonhalf:
 ; loading filter - this is same as in 8-bit depth
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
   shl           y_offsetd, filter_idx_shift
@@ -872,7 +868,7 @@ SECTION .text
   mova                 m9, [bilin_filter+x_offsetq+16]
   mova                m10, [bilin_filter+y_offsetq]
   mova                m11, [bilin_filter+y_offsetq+16]
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
@@ -900,7 +896,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 ; end of load filter
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
index befd81269..18eb03d12 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -13,8 +13,8 @@
 #include <emmintrin.h>
 #include <stddef.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
                                     const uint16_t *src, ptrdiff_t src_stride,
@@ -204,21 +204,15 @@ SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
 SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
 SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
 SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
-#if CONFIG_EXT_PARTITION
 SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
 SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
 SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
-#endif
 SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
 SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
 SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
 SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
 SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
 SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
-#if CONFIG_EXT_PARTITION
-SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); }
-SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); }
-#endif
 
 static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
   if (rows == 4) {
@@ -244,25 +238,17 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
     if (cols == 16) return subtract_16x32;
     if (cols == 32) return subtract_32x32;
     if (cols == 64) return subtract_64x32;
-#if CONFIG_EXT_PARTITION
-    if (cols == 128) return subtract_128x32;
-#endif  // CONFIG_EXT_PARTITION
   }
   if (rows == 64) {
     if (cols == 16) return subtract_16x64;
     if (cols == 32) return subtract_32x64;
     if (cols == 64) return subtract_64x64;
-#if CONFIG_EXT_PARTITION
     if (cols == 128) return subtract_128x64;
-#endif  // CONFIG_EXT_PARTITION
   }
-#if CONFIG_EXT_PARTITION
   if (rows == 128) {
-    if (cols == 32) return subtract_32x128;
     if (cols == 64) return subtract_64x128;
     if (cols == 128) return subtract_128x128;
   }
-#endif  // CONFIG_EXT_PARTITION
   assert(0);
   return NULL;
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
index cf8ea498c..0d954e178 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -14,6 +14,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;unsigned int aom_highbd_calc16x16var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 62acf3ed3..fdfadc886 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -12,13 +12,17 @@
 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
@@ -185,13 +189,11 @@ VAR_FN(16, 16, 16, 8);
 VAR_FN(16, 8, 8, 7);
 VAR_FN(8, 16, 8, 7);
 VAR_FN(8, 8, 8, 6);
-#if CONFIG_EXT_PARTITION_TYPES
 VAR_FN(16, 4, 16, 6);
 VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 16, 8);
+VAR_FN(32, 8, 8, 8);
 VAR_FN(16, 64, 16, 10);
 VAR_FN(64, 16, 16, 10);
-#endif
 
 #undef VAR_FN
 
@@ -398,7 +400,6 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
 #define FNS(opt)                        \
   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
@@ -416,20 +417,6 @@ DECLS(sse2);
   FN(32, 8, 16, 5, 3, opt, (int64_t));  \
   FN(16, 64, 16, 4, 6, opt, (int64_t)); \
   FN(64, 16, 16, 6, 4, opt, (int64_t))
-#else
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t))
-#endif
 
 FNS(sse2);
 
@@ -577,7 +564,6 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
 #define FNS(opt)                        \
   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
@@ -595,30 +581,104 @@ DECLS(sse2);
   FN(32, 8, 16, 5, 3, opt, (int64_t));  \
   FN(16, 64, 16, 4, 6, opt, (int64_t)); \
   FN(64, 16, 16, 6, 4, opt, (int64_t));
-#else
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));
-#endif
 
 FNS(sse2);
 
 #undef FNS
 #undef FN
 
-void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint16_t *comp_pred, int width, int height,
                                     int subpel_x_q3, int subpel_y_q3,
                                     const uint8_t *ref8, int ref_stride,
                                     int bd) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+      uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     if (width >= 8) {
@@ -648,54 +708,48 @@ void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
         ref += 2 * ref_stride;
       }
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                               width, kernel, 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                              width, NULL, -1, kernel, 16, width, height, bd);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_highbd_convolve8_horiz(ref8, ref_stride,
-                                 CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
-                                 16, NULL, -1, width, height, bd);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
-                                width, NULL, -1, kernel, 16, width, height, bd);
-    } else {
-      DECLARE_ALIGNED(16, uint16_t,
-                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
-                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                 intermediate_height, bd);
-      aom_highbd_convolve8_vert(
-          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
-          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
-          16, width, height, bd);
-    }
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+                               ref_stride, CONVERT_TO_BYTEPTR(temp),
+                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                               intermediate_height, bd);
+    aom_highbd_convolve8_vert(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+        16, width, height, bd);
   }
 }
 
-void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
-                                             const uint8_t *pred8, int width,
-                                             int height, int subpel_x_q3,
-                                             int subpel_y_q3,
-                                             const uint8_t *ref8,
-                                             int ref_stride, int bd) {
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
   int i;
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
   assert(!(width * height & 7));
   n = width * height >> 3;
@@ -707,3 +761,102 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
     pred += 8;
   }
 }
+
+static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                               const __m128i *w0,
+                                               const __m128i *w1,
+                                               const __m128i *r,
+                                               void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+  __m128i sum = _mm_adds_epu16(mult0, mult1);
+  __m128i round = _mm_adds_epu16(sum, *r);
+  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, shift);
+}
+
+void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
+                                       const uint8_t *pred8, int width,
+                                       int height, const uint8_t *ref8,
+                                       int ref_stride,
+                                       const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // Read 8 pixels one row at a time
+    assert(!(width & 7));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 8) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+        comp_pred += 8;
+        pred += 8;
+        ref += 8;
+      }
+      ref += ref_stride - width;
+    }
+  } else {
+    // Read 4 pixels two rows at a time
+    assert(!(width & 3));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+      comp_pred += 8;
+      pred += 8;
+      ref += 2 * ref_stride;
+    }
+  }
+}
+
+void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  int n;
+  int i;
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
+  assert(!(width * height & 7));
+  n = width * height >> 3;
+
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+    comp_pred += 8;
+    pred += 8;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
index cc7f52811..6c247a91b 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -11,8 +11,8 @@
 
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
index 6b8922b8c..1e67d392e 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -11,7 +11,20 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i y0 = _mm256_sad_epu8(x0, zero);
+  __m256i y1 = _mm256_sad_epu8(x1, zero);
+  y0 = _mm256_add_epi64(y0, y1);
+  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
+  y0 = _mm256_add_epi64(u0, y0);
+  u0 = _mm256_unpackhi_epi64(y0, y0);
+  return _mm256_add_epi16(y0, u0);
+}
 
 static INLINE __m256i dc_sum_32(const uint8_t *ref) {
   const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
@@ -25,13 +38,31 @@ static INLINE __m256i dc_sum_32(const uint8_t *ref) {
 
 static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                   ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
+  for (int i = 0; i < height; ++i) {
     _mm256_storeu_si256((__m256i *)dst, *r);
     dst += stride;
   }
 }
 
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+                                    int height, uint8_t *dst,
+                                    ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r0);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+    dst += stride;
+  }
+}
+
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+    dst += stride;
+  }
+}
+
 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const __m256i sum_above = dc_sum_32(above);
@@ -168,11 +199,58 @@ void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   uint32_t sum = _mm_cvtsi128_si32(left_sum);
   sum += 24;
   sum /= 48;
-
   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 32-wide, 64-row block
+  const __m256i sum_above = dc_sum_32(above);
+  __m256i sum_left = dc_sum_64(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);  // combined above + left total
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));  // read the low 32 bits
+  sum += 48;  // half the divisor, so the division rounds to nearest
+  sum /= 96;  // 32 above + 64 left samples
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);  // broadcast the DC byte to 32 lanes
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 64-wide, 64-row block
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = dc_sum_64(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);  // combined above + left total
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));  // read the low 32 bits
+  sum += 64;  // half the divisor, so the division rounds to nearest
+  sum /= 128;  // 64 above + 64 left samples
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);  // broadcast the DC byte to 32 lanes
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 64-wide, 32-row block
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = dc_sum_32(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);  // combined above + left total
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));  // read the low 32 bits
+  sum += 48;  // half the divisor, so the division rounds to nearest
+  sum /= 96;  // 64 above + 32 left samples
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);  // broadcast the DC byte to 32 lanes
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 64-wide, 16-row block
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));  // NOTE(review): dc_sum_16_sse2 is defined elsewhere; presumably sums 16 bytes
+  sum_left = _mm256_add_epi16(sum_left, sum_above);  // only the low 128 bits are read below
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));  // read the low 32 bits
+  sum += 40;  // half the divisor, so the division rounds to nearest
+  sum /= 80;  // 64 above + 16 left samples
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);  // broadcast the DC byte to 32 lanes
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -187,6 +265,62 @@ void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 32x64 block with the rounded mean of `above`.
+  __m256i sum = dc_sum_32(above);
+  (void)left;  // unused: only the above row contributes
+
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);  // rounding offset (half of 32)
+  sum = _mm256_srai_epi16(sum, 5);  // divide by 32
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x64 block with the rounded mean of `above`.
+  __m256i sum = dc_sum_64(above);
+  (void)left;  // unused: only the above row contributes
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);  // rounding offset (half of 64)
+  sum = _mm256_srai_epi16(sum, 6);  // divide by 64
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x32 block with the rounded mean of `above`.
+  __m256i sum = dc_sum_64(above);
+  (void)left;  // unused: only the above row contributes
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);  // rounding offset (half of 64)
+  sum = _mm256_srai_epi16(sum, 6);  // divide by 64
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x16 block with the rounded mean of `above`.
+  __m256i sum = dc_sum_64(above);
+  (void)left;  // unused: only the above row contributes
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);  // rounding offset (half of 64)
+  sum = _mm256_srai_epi16(sum, 6);  // divide by 64
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
@@ -202,6 +336,63 @@ void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {  // Fill a 32x64 block with the rounded mean of `left`.
+  __m256i sum = dc_sum_64(left);
+  (void)above;  // unused: only the left column contributes
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);  // rounding offset (half of 64)
+  sum = _mm256_srai_epi16(sum, 6);  // divide by 64
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {  // Fill a 64x64 block with the rounded mean of `left`.
+  __m256i sum = dc_sum_64(left);
+  (void)above;  // unused: only the left column contributes
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);  // rounding offset (half of 64)
+  sum = _mm256_srai_epi16(sum, 6);  // divide by 64
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {  // Fill a 64x32 block with the rounded mean of `left`.
+  __m256i sum = dc_sum_32(left);
+  (void)above;  // unused: only the left column contributes
+
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);  // rounding offset (half of 32)
+  sum = _mm256_srai_epi16(sum, 5);  // divide by 32
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 within each 128-bit lane
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {  // Fill a 64x16 block with the rounded mean of `left`.
+  __m128i sum = dc_sum_16_sse2(left);  // NOTE(review): helper defined elsewhere; presumably sums 16 bytes
+  (void)above;  // unused: only the left column contributes
+
+  const __m128i eight = _mm_set1_epi16(8);
+  sum = _mm_add_epi16(sum, eight);  // rounding offset (half of 16)
+  sum = _mm_srai_epi16(sum, 4);  // divide by 16
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i r = _mm_shuffle_epi8(sum, zero);  // all-zero control: replicate byte 0 to all 16 bytes
+  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);  // duplicate r into both 128-bit halves
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -211,6 +402,42 @@ void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 32x64 block with the constant 0x80.
+  (void)above;  // neither neighbour array is read
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 bytes of 128
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x64 block with the constant 0x80.
+  (void)above;  // neither neighbour array is read
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 bytes of 128
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x32 block with the constant 0x80.
+  (void)above;  // neither neighbour array is read
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 bytes of 128
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x16 block with the constant 0x80.
+  (void)above;  // neither neighbour array is read
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 bytes of 128
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
@@ -218,8 +445,39 @@ void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // Copy the 32-byte above row into 64 rows.
+  const __m256i row = _mm256_loadu_si256((const __m256i *)above);  // above[0..31], unaligned load
+  (void)left;  // unused: the left column is not read
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // Copy the 64-byte above row into 64 rows.
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);  // above[0..31]
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));  // above[32..63]
+  (void)left;  // unused: the left column is not read
+  row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // Copy the 64-byte above row into 32 rows.
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);  // above[0..31]
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));  // above[32..63]
+  (void)left;  // unused: the left column is not read
+  row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // Copy the 64-byte above row into 16 rows.
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);  // above[0..31]
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));  // above[32..63]
+  (void)left;  // unused: the left column is not read
+  row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
-// TM_PRED
+// PAETH_PRED
 
 // Return 16 16-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
@@ -336,6 +594,26 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Paeth prediction, 16 wide x 64 rows
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);  // byte preceding above[0], broadcast as 16-bit
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);  // NOTE(review): helper defined outside this hunk
+
+  for (int j = 0; j < 4; ++j) {  // 4 groups of 16 rows
+    const __m256i l = get_left_vector(left + j * 16);  // left bytes for this group of rows
+    __m256i rep = _mm256_set1_epi16(0x8000);  // shuffle control: low byte picks source byte 0, high byte 0x80 yields zero
+    for (int i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);  // selected left byte, zero-extended in every 16-bit lane
+      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+      _mm_store_si128((__m128i *)dst, row);  // aligned store: dst must be 16-byte aligned on every row
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // select the next left byte
+    }
+  }
+}
+
 // Return 32 8-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                       const __m256i *top1,
@@ -411,3 +689,123 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     rep = _mm256_add_epi16(rep, one);
   }
 }
+
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Paeth prediction, 32 wide x 64 rows
+  const __m256i t0 = get_top_vector(above);  // top vector for columns 0..15 (helper defined outside this hunk)
+  const __m256i t1 = get_top_vector(above + 16);  // columns 16..31
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);  // byte preceding above[0], broadcast as 16-bit
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {  // 4 groups of 16 rows
+    const __m256i l = get_left_vector(left + j * 16);  // left bytes for this group of rows
+    __m256i rep = _mm256_set1_epi16(0x8000);  // shuffle control: low byte picks source byte 0, high byte 0x80 yields zero
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);  // selected left byte, zero-extended in every 16-bit lane
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);  // aligned stores: dst must be 16-byte aligned on every row
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // select the next left byte
+    }
+  }
+}
+
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Paeth prediction, 64 wide x 32 rows
+  const __m256i t0 = get_top_vector(above);  // top vector for columns 0..15 (helper defined outside this hunk)
+  const __m256i t1 = get_top_vector(above + 16);  // columns 16..31
+  const __m256i t2 = get_top_vector(above + 32);  // columns 32..47
+  const __m256i t3 = get_top_vector(above + 48);  // columns 48..63
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);  // byte preceding above[0], broadcast as 16-bit
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {  // 2 groups of 16 rows
+    const __m256i l = get_left_vector(left + j * 16);  // left bytes for this group of rows
+    __m256i rep = _mm256_set1_epi16(0x8000);  // shuffle control: low byte picks source byte 0, high byte 0x80 yields zero
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);  // selected left byte, zero-extended in every 16-bit lane
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);  // aligned stores: dst must be 16-byte aligned on every row
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // select the next left byte
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Paeth prediction, 64 wide x 64 rows
+  const __m256i t0 = get_top_vector(above);  // top vector for columns 0..15 (helper defined outside this hunk)
+  const __m256i t1 = get_top_vector(above + 16);  // columns 16..31
+  const __m256i t2 = get_top_vector(above + 32);  // columns 32..47
+  const __m256i t3 = get_top_vector(above + 48);  // columns 48..63
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);  // byte preceding above[0], broadcast as 16-bit
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {  // 4 groups of 16 rows
+    const __m256i l = get_left_vector(left + j * 16);  // left bytes for this group of rows
+    __m256i rep = _mm256_set1_epi16(0x8000);  // shuffle control: low byte picks source byte 0, high byte 0x80 yields zero
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);  // selected left byte, zero-extended in every 16-bit lane
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);  // aligned stores: dst must be 16-byte aligned on every row
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // select the next left byte
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Paeth prediction, 64 wide x 16 rows
+  const __m256i t0 = get_top_vector(above);  // top vector for columns 0..15 (helper defined outside this hunk)
+  const __m256i t1 = get_top_vector(above + 16);  // columns 16..31
+  const __m256i t2 = get_top_vector(above + 32);  // columns 32..47
+  const __m256i t3 = get_top_vector(above + 48);  // columns 48..63
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);  // byte preceding above[0], broadcast as 16-bit
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i;
+  const __m256i l = get_left_vector(left);  // single group: all 16 rows
+  __m256i rep = _mm256_set1_epi16(0x8000);  // shuffle control: low byte picks source byte 0, high byte 0x80 yields zero
+  for (i = 0; i < 16; ++i) {
+    const __m256i l16 = _mm256_shuffle_epi8(l, rep);  // selected left byte, zero-extended in every 16-bit lane
+
+    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+    _mm_store_si128((__m128i *)dst, r0);  // aligned stores: dst must be 16-byte aligned on every row
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+
+    dst += stride;
+    rep = _mm256_add_epi16(rep, one);  // select the next left byte
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
index 2a83b9001..5b2452c8e 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -11,11 +11,11 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
-static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < 4; ++i) {
+static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+                                ptrdiff_t stride) {
+  for (int i = 0; i < height; i += 2) {
     *(uint32_t *)dst = dc;
     dst += stride;
     *(uint32_t *)dst = dc;
@@ -51,6 +51,17 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
   }
 }
 
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {  // Fill `height` rows of 64 bytes with *row repeated four times.
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, *row);  // aligned stores: dst must be 16-byte aligned on every row
+    _mm_store_si128((__m128i *)(dst + 16), *row);
+    _mm_store_si128((__m128i *)(dst + 32), *row);
+    _mm_store_si128((__m128i *)(dst + 48), *row);
+    dst += stride;  // advance to the next row
+  }
+}
+
 static INLINE __m128i dc_sum_4(const uint8_t *ref) {
   __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   const __m128i zero = _mm_setzero_si128();
@@ -83,6 +94,34 @@ static INLINE __m128i dc_sum_32(const uint8_t *ref) {
   return _mm_add_epi16(x0, high);
 }
 
+static INLINE __m128i dc_sum_64(const uint8_t *ref) {  // Sum the 64 bytes at ref.
+  __m128i x0 = _mm_load_si128((__m128i const *)ref);  // aligned loads: ref must be 16-byte aligned
+  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
+  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
+  const __m128i zero = _mm_setzero_si128();
+  x0 = _mm_sad_epu8(x0, zero);  // SAD against zero: one 8-byte sum per 64-bit lane
+  x1 = _mm_sad_epu8(x1, zero);
+  x2 = _mm_sad_epu8(x2, zero);
+  x3 = _mm_sad_epu8(x3, zero);
+  x0 = _mm_add_epi16(x0, x1);  // pairwise accumulation of the four partial results
+  x2 = _mm_add_epi16(x2, x3);
+  x0 = _mm_add_epi16(x0, x2);
+  const __m128i high = _mm_unpackhi_epi64(x0, x0);  // copy the upper 64-bit lane downwards
+  return _mm_add_epi16(x0, high);  // total (at most 64 * 255) sits in the low 16-bit element
+}
+
+#define DC_MULTIPLIER_1X2 0x5556  // 21846, just above 2^16 / 3
+#define DC_MULTIPLIER_1X4 0x3334  // 13108, just above 2^16 / 5
+
+#define DC_SHIFT2 16  // final right shift applied after the multiply
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier) {  // Returns ((num >> shift1) * multiplier) >> DC_SHIFT2.
+  const int interm = num >> shift1;  // power-of-two part of the divisor
+  return interm * multiplier >> DC_SHIFT2;  // `*` binds tighter than `>>`: multiply first, then shift
+}
+
 // -----------------------------------------------------------------------------
 // DC_PRED
 
@@ -94,11 +133,26 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 6;
-  sum /= 12;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // DC fill of a 4-wide, 16-row block
+  const __m128i sum_left = dc_sum_16(left);
+  __m128i sum_above = dc_sum_4(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 10;  // rounding offset: half of 20 (4 + 16 samples)
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);  // >> 2 then scale by ~1/5: about /20
+
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  const uint32_t pred = _mm_cvtsi128_si32(row);  // four copies of the DC byte in one 32-bit word
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -109,7 +163,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 6;
-  sum /= 12;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_8xh(&row, 4, dst, stride);
@@ -123,11 +177,37 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 12;
-  sum /= 24;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // DC fill of an 8-wide, 32-row block
+  const __m128i sum_left = dc_sum_32(left);
+  __m128i sum_above = dc_sum_8(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 20;  // rounding offset: half of 40 (8 + 32 samples)
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);  // >> 3 then scale by ~1/5: about /40
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // DC fill of a 16-wide, 4-row block
+  const __m128i sum_left = dc_sum_4(left);
+  __m128i sum_above = dc_sum_16(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 10;  // rounding offset: half of 20 (16 + 4 samples)
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);  // >> 2 then scale by ~1/5: about /20
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m128i sum_left = dc_sum_8(left);
@@ -136,7 +216,7 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 12;
-  sum /= 24;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_16xh(&row, 8, dst, stride);
 }
@@ -149,11 +229,37 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 24;
-  sum /= 48;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 16-wide, 64-row block
+  const __m128i sum_left = dc_sum_64(left);
+  __m128i sum_above = dc_sum_16(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 40;  // rounding offset: half of 80 (16 + 64 samples)
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);  // >> 4 then scale by ~1/5: about /80
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {  // DC fill of a 32-wide, 8-row block
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sum_left = dc_sum_8(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 20;  // rounding offset: half of 40 (32 + 8 samples)
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);  // >> 3 then scale by ~1/5: about /40
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   __m128i sum_above = dc_sum_32(above);
@@ -162,11 +268,63 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 24;
-  sum /= 48;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 32-wide, 64-row block
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sum_left = dc_sum_64(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 48;  // rounding offset: half of 96 (32 + 64 samples)
+  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);  // >> 5 then scale by ~1/3: about /96
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 64-wide, 64-row block
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_64(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 64;  // rounding offset: half of 128 (64 + 64 samples)
+  sum /= 128;  // power-of-two divisor, so no multiply-shift helper is used here
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 64-wide, 32-row block
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_32(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 48;  // rounding offset: half of 96 (64 + 32 samples)
+  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);  // >> 5 then scale by ~1/3: about /96
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {  // DC fill of a 64-wide, 16-row block
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_16(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // combined above + left total
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);  // read the low 32 bits
+  sum += 40;  // rounding offset: half of 80 (64 + 16 samples)
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);  // >> 4 then scale by ~1/5: about /80
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the DC byte
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_TOP
 
@@ -181,7 +339,21 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   sum_above = _mm_packus_epi16(sum_above, sum_above);
 
   const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Fill a 4x16 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_4(above);
+  const __m128i two = _mm_set1_epi16((int16_t)2);
+  sum_above = _mm_add_epi16(sum_above, two);  // rounding offset (half of 4)
+  sum_above = _mm_srai_epi16(sum_above, 2);  // divide by 4
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low four words
+  sum_above = _mm_packus_epi16(sum_above, sum_above);  // narrow words to bytes with unsigned saturation
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_above);  // four copies of the DC byte in one 32-bit word
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -208,6 +380,31 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Fill an 8x32 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_8(above);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_above = _mm_add_epi16(sum_above, four);  // rounding offset (half of 8)
+  sum_above = _mm_srai_epi16(sum_above, 3);  // divide by 8
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Fill a 16x4 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_16(above);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);  // rounding offset (half of 16)
+  sum_above = _mm_srai_epi16(sum_above, 4);  // divide by 16
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
@@ -235,6 +432,33 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 16x64 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_16(above);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);  // rounding offset (half of 16)
+  sum_above = _mm_srai_epi16(sum_above, 4);  // divide by 16
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {  // Fill a 32x8 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);  // rounding offset (half of 32)
+  sum_above = _mm_srai_epi16(sum_above, 5);  // divide by 32
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -249,6 +473,62 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 32x64 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);  // rounding offset (half of 32)
+  sum_above = _mm_srai_epi16(sum_above, 5);  // divide by 32
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x64 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);  // rounding offset (half of 64)
+  sum_above = _mm_srai_epi16(sum_above, 6);  // divide by 64
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x32 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);  // rounding offset (half of 64)
+  sum_above = _mm_srai_epi16(sum_above, 6);  // divide by 64
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {  // Fill a 64x16 block with the rounded mean of `above`.
+  (void)left;  // unused: only the above row contributes
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);  // rounding offset (half of 64)
+  sum_above = _mm_srai_epi16(sum_above, 6);  // divide by 64
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // duplicate each byte: word 0 becomes (dc, dc)
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // replicate word 0 across the low 8 bytes
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // copy the low 8 bytes into the high 8
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_LEFT
 
@@ -263,7 +543,22 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   sum_left = _mm_packus_epi16(sum_left, sum_left);
 
   const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_16(left);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);
+  sum_left = _mm_srai_epi16(sum_left, 4);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -291,6 +586,33 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_32(left);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);
+  sum_left = _mm_srai_epi16(sum_left, 5);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_4(left);
+  const __m128i two = _mm_set1_epi16((uint16_t)2);
+  sum_left = _mm_add_epi16(sum_left, two);
+  sum_left = _mm_srai_epi16(sum_left, 2);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -319,6 +641,34 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_8(left);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_left = _mm_add_epi16(sum_left, four);
+  sum_left = _mm_srai_epi16(sum_left, 3);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
@@ -333,6 +683,62 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_32(left);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);
+  sum_left = _mm_srai_epi16(sum_left, 5);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_16(left);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);
+  sum_left = _mm_srai_epi16(sum_left, 4);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_128
 
@@ -341,7 +747,15 @@ void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   (void)above;
   (void)left;
   const uint32_t pred = 0x80808080;
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const uint32_t pred = 0x80808080;
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -360,6 +774,22 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
@@ -377,6 +807,23 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -386,6 +833,42 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // V_PRED
 
@@ -393,7 +876,14 @@ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   const uint32_t pred = *(uint32_t *)above;
   (void)left;
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint32_t pred = *(uint32_t *)above;
+  (void)left;
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -410,6 +900,20 @@ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+  (void)left;
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_load_si128((__m128i const *)above);
+  (void)left;
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   const __m128i row = _mm_load_si128((__m128i const *)above);
@@ -424,19 +928,75 @@ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_load_si128((__m128i const *)above);
+  (void)left;
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
   const __m128i row0 = _mm_load_si128((__m128i const *)above);
   const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, row0);
+    _mm_store_si128((__m128i *)(dst + 16), row1);
+    dst += stride;
+  }
+}
+
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  int i;
-  for (i = 0; i < 16; ++i) {
+  v_predictor_32xh(dst, stride, above, 8);
+}
+
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_32xh(dst, stride, above, 16);
+}
+
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_32xh(dst, stride, above, 64);
+}
+
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
+  const __m128i row0 = _mm_load_si128((__m128i const *)above);
+  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+  for (int i = 0; i < height; ++i) {
     _mm_store_si128((__m128i *)dst, row0);
     _mm_store_si128((__m128i *)(dst + 16), row1);
+    _mm_store_si128((__m128i *)(dst + 32), row2);
+    _mm_store_si128((__m128i *)(dst + 48), row3);
     dst += stride;
   }
 }
 
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 64);
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 32);
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 16);
+}
+
 // -----------------------------------------------------------------------------
 // H_PRED
 
@@ -471,25 +1031,7 @@ void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
 }
 
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)above;
   const __m128i left_col = _mm_load_si128((__m128i const *)left);
@@ -500,13 +1042,13 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
@@ -514,26 +1056,26 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   row0 = _mm_shufflelo_epi16(left_col_high, 0);
   row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
@@ -541,6 +1083,24 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+  left_col = _mm_unpacklo_epi8(left_col, left_col);
+  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
   _mm_storel_epi64((__m128i *)dst, row0);
   dst += stride;
   _mm_storel_epi64((__m128i *)dst, row1);
@@ -550,6 +1110,82 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   _mm_storel_epi64((__m128i *)dst, row3);
 }
 
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int count) {
+  (void)above;
+  for (int i = 0; i < count; ++i) {
+    const __m128i left_col = _mm_load_si128((__m128i const *)left);
+    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+    row0 = _mm_shufflelo_epi16(left_col_low, 0);
+    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    row0 = _mm_shufflelo_epi16(left_col_high, 0);
+    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+    row0 = _mm_shufflelo_epi16(left_col_high, 0);
+    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+    left += 16;
+  }
+}
+
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  h_predictor_8x16xc(dst, stride, above, left, 1);
+}
+
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  h_predictor_8x16xc(dst, stride, above, left, 2);
+}
+
 static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                      ptrdiff_t stride) {
   int i;
@@ -601,6 +1237,14 @@ static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
   h_pred_store_16xh(row, 4, dst, stride);
 }
 
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_16x8_1(&left_col_8p, dst, stride);
+}
+
 void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)above;
@@ -611,29 +1255,38 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
   h_prediction_16x8_2(&left_col_8p, dst, stride);
 }
 
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int count) {
   int i = 0;
-
   do {
-    left_col = _mm_load_si128((const __m128i *)left);
-    left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p, dst, stride);
+    const __m128i left_col = _mm_load_si128((const __m128i *)left);
+    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
+    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
     dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p, dst, stride);
+    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
     dst += stride << 2;
 
-    left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p, dst, stride);
+    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
+    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
     dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p, dst, stride);
+    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
     dst += stride << 2;
 
     left += 16;
     i++;
-  } while (i < 2);
+  } while (i < count);
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_16xh(dst, stride, left, 2);
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_16xh(dst, stride, left, 4);
 }
 
 static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
@@ -664,6 +1317,19 @@ static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
   h_pred_store_32xh(row, 4, dst, stride);
 }
 
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  __m128i left_col, left_col_8p;
+  (void)above;
+
+  left_col = _mm_load_si128((const __m128i *)left);
+
+  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_32x8_1(&left_col_8p, dst, stride);
+  dst += stride << 2;
+  h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
 void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   __m128i left_col, left_col_8p;
@@ -682,3 +1348,83 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dst += stride << 2;
   h_prediction_32x8_2(&left_col_8p, dst, stride);
 }
+
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int height) {
+  int i = height >> 2;
+  do {
+    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r0);
+    _mm_store_si128((__m128i *)(dst + stride), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+    left += 4;
+    dst += stride * 4;
+  } while (--i);
+}
+
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_32xh(dst, stride, left, 64);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int height) {
+  int i = height >> 2;
+  do {
+    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r0);
+    _mm_store_si128((__m128i *)(dst + 32), r0);
+    _mm_store_si128((__m128i *)(dst + 48), r0);
+    _mm_store_si128((__m128i *)(dst + stride), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+    left += 4;
+    dst += stride * 4;
+  } while (--i);
+}
+
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 64);
+}
+
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 32);
+}
+
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
index 85b82744e..807ed1770 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -11,11 +11,12 @@
 
 #include <tmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/intrapred_common.h"
 
 // -----------------------------------------------------------------------------
-// TM_PRED
+// PAETH_PRED
 
 // Return 8 16-bit pixels in one row
 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
@@ -82,6 +83,26 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i l = _mm_load_si128((const __m128i *)left);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 16; ++i) {
+    const __m128i l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -145,6 +166,28 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
 // Return 16 8-bit pixels in one row
 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                       const __m128i *top1,
@@ -154,6 +197,27 @@ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
   return _mm_packus_epi16(p0, p1);
 }
 
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);  // left[0..3]
+  const __m128i t = _mm_load_si128((const __m128i *)above);  // aligned load
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);  // above[0..7] as 16 bit
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);  // above[8..15] as 16 bit
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+  const __m128i one = _mm_set1_epi16(1);
+  // 16 wide, 4 rows: one paeth_16x1_pred() call and one 16-byte store per row.
+  for (int i = 0; i < 4; ++i) {
+    const __m128i l16 = _mm_shuffle_epi8(l, rep);  // left[i] in every lane
+    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+    // Aligned store: dst must be 16-byte aligned on every row.
+    _mm_store_si128((__m128i *)dst, row);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+  }
+}
+
 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -234,6 +298,57 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i t = _mm_load_si128((const __m128i *)above);  // aligned load
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);  // above[0..7] as 16 bit
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);  // above[8..15] as 16 bit
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  const __m128i one = _mm_set1_epi16(1);
+  // 16 wide, 64 rows: four batches of 16 rows, 16 left pixels per batch.
+  for (int j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));  // aligned load
+    __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);  // l[i] in every 16-bit lane
+      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+      _mm_store_si128((__m128i *)dst, row);  // aligned 16-byte store
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+    }
+  }
+}
+
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);  // above[0..15]
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));  // above[16..31]
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);  // above[0..7] as 16 bit
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);  // above[8..15] as 16 bit
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);  // above[16..23] as 16 bit
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);  // above[24..31] as 16 bit
+  // 32 wide, 8 rows: each row is two paeth_16x1_pred() results side by side.
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i l = _mm_loadl_epi64((const __m128i *)left);  // left[0..7]
+  __m128i l16;
+
+  for (int i = 0; i < 8; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);  // left[i] in every 16-bit lane
+    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+    // Aligned stores: dst must be 16-byte aligned on every row.
+    _mm_store_si128((__m128i *)dst, r32l);
+    _mm_store_si128((__m128i *)(dst + 16), r32h);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+  }
+}
+
 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -307,6 +422,162 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);  // above[0..15]
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));  // above[16..31]
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);  // above[0..7] as 16 bit
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);  // above[8..15] as 16 bit
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);  // above[16..23] as 16 bit
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);  // above[24..31] as 16 bit
+  // 32 wide, 64 rows: four batches of 16 rows, 16 left pixels per batch.
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));  // aligned load
+    __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);  // l[i] in every 16-bit lane
+      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      // Aligned stores: dst must be 16-byte aligned on every row.
+      _mm_store_si128((__m128i *)dst, r32l);
+      _mm_store_si128((__m128i *)(dst + 16), r32h);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+    }
+  }
+}
+
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);  // above[0..15]
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));  // above[48..63]
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);  // low 8 bytes of a as 16 bit
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);  // high 8 bytes of a as 16 bit
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+  // 64 wide, 32 rows: two batches of 16 rows, 16 left pixels per batch.
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));  // aligned load
+    __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);  // l[i] in every 16-bit lane
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+      // One row is four 16-pixel results; the stores are aligned stores.
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);  // above[0..15]
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));  // above[48..63]
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);  // low 8 bytes of a as 16 bit
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);  // high 8 bytes of a as 16 bit
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+  // 64 wide, 64 rows: four batches of 16 rows, 16 left pixels per batch.
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));  // aligned load
+    __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);  // l[i] in every 16-bit lane
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+      // One row is four 16-pixel results; the stores are aligned stores.
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);  // above[0..15]
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));  // above[48..63]
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);  // low 8 bytes of a as 16 bit
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);  // high 8 bytes of a as 16 bit
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+  // 64 wide, 16 rows: a single batch of 16 left pixels covers the block.
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i;
+  const __m128i l = _mm_load_si128((const __m128i *)left);  // aligned load
+  __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: l[0], zero-extended
+  for (i = 0; i < 16; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);  // l[i] in every 16-bit lane
+    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+    // One row is four 16-pixel results; the stores are aligned stores.
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);  // step the mask to the next left pixel
+  }
+}
+
 // -----------------------------------------------------------------------------
 // SMOOTH_PRED
 
@@ -315,9 +586,15 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
 // pixels[2]: right_pred vector
 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  if (height == 4)
+    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-  pixels[1] = _mm_loadl_epi64((const __m128i *)left);
 
   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
@@ -325,45 +602,52 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
   pixels[0] = _mm_unpacklo_epi16(d, bp);
 }
 
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vecotr
-// weights[2]: weights_w and scale - weights_w interleave vector
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weights) {
-  __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+                                  __m128i *weight_h, __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-
-  weights[0] = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weights[1] = _mm_sub_epi16(d, weights[0]);
-  weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);  // bytes 4..7
+  weight_h[0] = _mm_unpacklo_epi8(t, zero);  // kept as the row weights when height == 4
+  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);  // scale - weight
+  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);  // (w, scale - w) pairs
 
   if (height == 8) {
-    t = _mm_srli_si128(t, 4);
-    weights[0] = _mm_unpacklo_epi8(t, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);  // bytes 8..15
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);  // row weights replaced for height 8
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);  // bytes 16..31
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);  // first 8 of the 16 loaded weights
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);  // last 8 of the 16 loaded weights
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
   }
 }
 
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
-                                   int h, uint8_t *dst, ptrdiff_t stride) {
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+                                   const __m128i *ww, int h, uint8_t *dst,
+                                   ptrdiff_t stride, int second_half) {
   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
   __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
 
     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
     b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, weight[2]);
+    __m128i sum = _mm_madd_epi16(b, ww[0]);
 
     sum = _mm_add_epi32(s, sum);
     sum = _mm_add_epi32(sum, round);
@@ -383,10 +667,10 @@ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 4, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 4, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 4, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 4, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
 }
 
 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
@@ -394,33 +678,68 @@ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 8, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 8, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[3];
+  load_pixel_w4(above, left, 16, pixels);  // height 16: pixels[1] holds left[0..15]
+  // For height 16 load_weight_w4() fills wh[0..3]: two groups of 8 row weights.
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 16, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 8, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);  // rows 0..7
+  dst += stride << 3;  // advance 8 rows
+  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);  // rows 8..15, left pixels from index 8
 }
 
 // pixels[0]: above and below_pred interleave vector, first half
 // pixels[1]: above and below_pred interleave vector, second half
 // pixels[2]: left vector
 // pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-  pixels[2] = _mm_load_si128((const __m128i *)left);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
-
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);  // last left pixel, every lane
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);  // above[0..7]
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
   pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);  // last above pixel, every lane
+  // Load only as many left pixels as the block has rows.
+  if (height == 4) {
+    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);  // left[0..3]
+  } else if (height == 8) {
+    pixels[2] = _mm_loadl_epi64((const __m128i *)left);  // left[0..7]
+  } else if (height == 16) {
+    pixels[2] = _mm_load_si128((const __m128i *)left);  // left[0..15], aligned load
+  } else {  // any other height; the 8x32 predictor in this file passes 32
+    pixels[2] = _mm_load_si128((const __m128i *)left);  // left[0..15]
+    pixels[4] = pixels[0];  // [4..7] repeat the [0..3] layout so that
+    pixels[5] = pixels[1];  // &pixels[4] can be passed like &pixels[0]
+    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));  // left[16..31]
+    pixels[7] = pixels[3];
+  }
 }
 
 // weight_h[0]: weight_h vector
 // weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
 static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
@@ -429,7 +748,6 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
   const int we_offset = height < 8 ? 4 : 8;
   __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
   weight_h[0] = _mm_unpacklo_epi8(we, zero);
-
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
 
@@ -450,6 +768,19 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(we, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else if (height == 32) {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
@@ -531,355 +862,831 @@ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
-// pixels[0]: above and below_pred interleave vector, 1/4
-// pixels[1]: above and below_pred interleave vector, 2/4
-// pixels[2]: above and below_pred interleave vector, 3/4
-// pixels[3]: above and below_pred interleave vector, 3/4
-// pixels[4]: left vector
-// pixels[5]: left vector, h = 32 only
-// pixels[6]: right_pred vector
-static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab = _mm_load_si128((const __m128i *)above);
-  pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
-  pixels[4] = _mm_load_si128((const __m128i *)left);
-  pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[8];
+  load_pixel_w8(above, left, 32, pixels);  // fills pixels[0..7] for height 32
+  // wh[0..7]: the 32 row weights in four groups of 8, each with scale - weight.
+  __m128i wh[8], ww[2];
+  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+  // Four 8-row strips; &pixels[4] carries left[16..31] for the last two strips.
+  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;  // advance 8 rows
+  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);  // presumably 1 = upper 8 left pixels - confirm in smooth_pred_8xh
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, uint32_t bw,
+                                        uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;  // column weights: table offset bw
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;  // row weights: table offset bh
   const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);  // low lane only
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);  // pshufb mask: broadcast 16-bit word 0
+  const __m128i top_right =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);  // even bytes -> low 8 bytes
+  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));  // half the divisor
+  // One output row per y; each inner iteration produces 8 pixels of that row.
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);  // (scale - w_y) * bottom_left
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);  // (w_y, left_y) in every 32-bit lane
+    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
+    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);  // broadcast lane 0
+    // x advances by 8; the callers in this file pass bw = 16, 32 or 64.
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);  // (top, w_x) byte pairs
+      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
+      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
+      // (top, w_x) dotted with (w_y, left_y): top * w_y + w_x * left_y.
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+      // (scale - w_x) * top_right as 16-bit products, then widened to 32 bit.
+      const __m128i scale_m_weights_x =
+          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
+      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
+      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
+      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
+      // Add the rounded bottom-left term, then the top-right term.
+      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
+      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
+
+      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
+      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
+      // Divide by 2 << sm_weight_log2_scale (round already added above).
+      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
+      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
+      // packus sees each 32-bit sum as two 16-bit lanes; gat keeps the even bytes.
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);  // 8 pixels
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 4);  // bw = 16, bh = 4
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);  // bw = 16, bh = 8
+}
 
-  x = _mm_unpackhi_epi8(ab, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);  // bw = 16, bh = 16
 }
 
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// ... ...
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-// ... ...
-static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);  // bw = 16, bh = 32
+}
 
-  if (height == 8) {
-    weight_h[0] = _mm_unpacklo_epi8(w8, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);  // scale - weight_h
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 8);  // bw = 32, bh = 8
+}
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 16);  // bw = 32, bh = 16
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 32);  // bw = 32, bh = 32
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 64);  // bw = 32, bh = 64
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 64);  // bw = 64, bh = 64
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 32);  // bw = 64, bh = 32
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 16);  // bw = 64, bh = 16
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 64);  // bw = 16, bh = 64
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// pixels[0]: above and below_pred interleave vector
+static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);  // above[0..3]
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);  // last left pixel, every lane
+  d = _mm_unpacklo_epi8(d, zero);  // widen to 16-bit lanes
+  pixels[0] = _mm_unpacklo_epi16(d, bp);  // (above[x], bp) pairs for x = 0..3
+}
+
+// weights[0]: weights_h vector
+// weights[1]: scale - weights_h vector
+static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  // Heights 4 and 8 read weight_array at byte offset == height; others at 16.
+  if (height == 4) {
+    const __m128i weight =
+        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);  // bytes 4..7
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);  // scale - weight
+  } else if (height == 8) {
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);  // bytes 8..15
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else {  // any other height; the caller in view passes 16 with weights[4]
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);  // bytes 16..31
+    weights[0] = _mm_unpacklo_epi8(weight, zero);  // first 8 of the 16 loaded weights
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);  // last 8 of the 16 loaded weights
+    weights[3] = _mm_sub_epi16(d, weights[2]);
   }
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {  // writes h rows of 4 pixels
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i inc = _mm_set1_epi16(0x202);  // +2 on both index bytes: next 16-bit weight
+  const __m128i gat = _mm_set1_epi32(0xc080400);  // pshufb mask gathering bytes 0, 4, 8, 12
+  __m128i d = _mm_set1_epi16(0x100);  // pshufb mask: 16-bit word 0 in every lane
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);  // weight i broadcast
+    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);  // weight[1] word i broadcast
+    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);  // dot product of the 16-bit pairs
+    sum = _mm_add_epi32(sum, pred_round);  // half of 1 << sm_weight_log2_scale
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+    sum = _mm_shuffle_epi8(sum, gat);  // low byte of each 32-bit lane -> bytes 0..3
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);  // store 4 pixels
+    dst += stride;
+    d = _mm_add_epi16(d, inc);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 4, &pixels);  // above[0..3] paired with left[3]
+  // weights[0] = 4 row weights, weights[1] = scale - weight.
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 4, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);  // writes 4 rows
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 8, &pixels);
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 8, weights);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  load_weight_v_w4(sm_weight_arrays, 16, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+  pixels[1] = _mm_unpackhi_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_h) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height < 16) {
+    const int offset = height < 8 ? 4 : 8;
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
-static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
-  const __m128i one = _mm_set1_epi16(1);
+static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
+                                     int h, uint8_t *dst, ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
+  __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
+  for (int i = 0; i < h; ++i) {
     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-    __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
-    __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
 
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[6]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-    __m128i sum2 = _mm_madd_epi16(b, ww[2]);
-    __m128i sum3 = _mm_madd_epi16(b, ww[3]);
+    s0 = _mm_add_epi32(s0, pred_round);
+    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
 
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    s2 = _mm_add_epi32(s2, sum2);
-    s2 = _mm_add_epi32(s2, round);
-    s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
-
-    s3 = _mm_add_epi32(s3, sum3);
-    s3 = _mm_add_epi32(s3, round);
-    s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    sum1 = _mm_packus_epi16(s2, s3);
-    sum1 = _mm_shuffle_epi8(sum1, gat);
-
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    _mm_storel_epi64((__m128i *)(dst + 8), sum1);
+    s1 = _mm_add_epi32(s1, pred_round);
+    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
 
+    __m128i sum01 = _mm_packus_epi16(s0, s1);
+    sum01 = _mm_shuffle_epi8(sum01, gat);
+    _mm_storel_epi64((__m128i *)dst, sum01);
     dst += stride;
-    rep = _mm_add_epi16(rep, one);
+
     d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 8, pixels);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 4, pixels);
 
-  __m128i wh[2], ww[4];
-  load_weight_w16(sm_weight_arrays, 8, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 4, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
 }
 
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 8, pixels);
 
-  __m128i wh[4], ww[4];
-  load_weight_w16(sm_weight_arrays, 16, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 8, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 16, pixels);
+
+  __m128i wh[4];
+  load_weight_v_w8(sm_weight_arrays, 16, wh);
+
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
 }
 
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 32, pixels);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 32, pixels);
 
-  __m128i wh[8], ww[4];
-  load_weight_w16(sm_weight_arrays, 32, wh, ww);
+  __m128i wh[8];
+  load_weight_v_w8(sm_weight_arrays, 32, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
 }
 
-static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab0 = _mm_load_si128((const __m128i *)above);
-  __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16));
+static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i bottom_left =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round =
+      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i scale_m_weights_y =
+        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      // 8 -> 16
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
+      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
+      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
+      // top_x * weights_y + scale_m_weights_y * bottom_left
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      pred_lo = _mm_add_epi32(pred_lo, round);
+      pred_hi = _mm_add_epi32(pred_hi, round);
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
 
-  pixels[10] = _mm_set1_epi16((uint16_t)above[31]);
-  pixels[8] = _mm_load_si128((const __m128i *)left);
-  pixels[9] = _mm_load_si128((const __m128i *)(left + 16));
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab0, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab0, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
+}
 
-  x = _mm_unpacklo_epi8(ab1, zero);
-  pixels[4] = _mm_unpacklo_epi16(x, bp);
-  pixels[5] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
+}
 
-  x = _mm_unpackhi_epi8(ab1, zero);
-  pixels[6] = _mm_unpacklo_epi16(x, bp);
-  pixels[7] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
 }
 
-static INLINE void load_weight_w32(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  if (height == 4)
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
+  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+}
+
+// weights[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  (void)height;
+  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
   const __m128i zero = _mm_setzero_si128();
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+
+  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
+  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i rep = _mm_set1_epi16(0x8000);
 
-    __m128i x = _mm_unpacklo_epi8(w32_0, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
+    b = _mm_unpacklo_epi16(b, pixel[1]);
+    __m128i sum = _mm_madd_epi16(b, weight[0]);
 
-    x = _mm_unpackhi_epi8(w32_0, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
 
-    x = _mm_unpacklo_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[4] = _mm_unpacklo_epi16(x, y);
-    weight_w[5] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
 
-    x = _mm_unpackhi_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[6] = _mm_unpacklo_epi16(x, y);
-    weight_w[7] = _mm_unpackhi_epi16(x, y);
+    rep = _mm_add_epi16(rep, one);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 4, pixels);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 4, &weights);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
+}
 
-    weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]);
-    weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]);
-    weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]);
-    weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 8, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 16, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  dst += stride << 3;
+
+  pixels[0] = _mm_srli_si128(pixels[0], 8);
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+// pixels[2]: left vector + 16
+// pixels[3]: right_pred vector
+static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[3] = pixels[1];
   }
 }
 
-static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_w) {
+  (void)height;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+}
+
+static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
+                                     int h, uint8_t *dst, ptrdiff_t stride,
+                                     int second_half) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[8] : pixels[9];
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
+    b = _mm_unpacklo_epi16(b, pixels[1]);
+    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
 
-    int j;
-    __m128i s[8];
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[10]);
+    sum0 = _mm_add_epi32(sum0, pred_round);
+    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; ++j) {
-      s[j] = _mm_madd_epi16(pixels[j], wh_sc);
-      s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j]));
-      s[j] = _mm_add_epi32(s[j], round);
-      s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale);
-    }
+    sum1 = _mm_add_epi32(sum1, pred_round);
+    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; j += 2) {
-      __m128i sum = _mm_packus_epi16(s[j], s[j + 1]);
-      sum = _mm_shuffle_epi8(sum, gat);
-      _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum);
-    }
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, gat);
+    _mm_storel_epi64((__m128i *)dst, sum0);
     dst += stride;
+
     rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 4, pixels);
 
-  __m128i wh[4], ww[8];
-  load_weight_w32(sm_weight_arrays, 16, wh, ww);
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 4, ww);
 
-  smooth_pred_32x8(pixels, wh, ww, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
 }
 
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 32, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 8, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 8, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 16, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 16, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
+}
 
-  __m128i wh[8], ww[8];
-  load_weight_w32(sm_weight_arrays, 32, wh, ww);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[4];
+  load_pixel_h_w8(above, left, 32, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 32, ww);
 
-  smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i tr_ly =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
+      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
+      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
+      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
+      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
+      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_round);
+      pred_hi = _mm_add_epi32(pred_hi, pred_round);
+
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
 }
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
deleted file mode 100644
index bc1bb2ff3..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pb_1: times 16 db 1
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
-sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
-sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
-sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb               %4, %1, %3
-  pxor                %3, %1
-  pand                %3, [GLOBAL(pb_1)]
-  psubb               %4, %3
-  pavgb               %4, %2
-%endmacro
-
-INIT_XMM ssse3
-cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m3, [aboveq]
-  pshufb              m1, m3, [GLOBAL(sh_b23456777)]
-  pshufb              m2, m3, [GLOBAL(sh_b12345677)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
-  pavgb               m3, m2
-
-  ; store 4 lines
-  movd    [dstq        ], m3
-  movd    [dstq+strideq], m4
-  lea               dstq, [dstq+strideq*2]
-  psrldq              m3, 1
-  psrldq              m4, 1
-  movd    [dstq        ], m3
-  movd    [dstq+strideq], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  movd                m0, [leftq]               ; l1, l2, l3, l4
-  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
-  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
-  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
-  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
-  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1
-  ; A2 B2 A1 B1
-  ; A3 B3 A2 B2
-  ; A4 B4 A3 B3
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
-  pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
-
-  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
-
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+stride3q ], m3
-  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+strideq*2], m3
-  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+strideq  ], m3
-  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
-  movd  [dstq          ], m3
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
-  movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
-  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [word]
-  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [word]
-  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [word]
-  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [word]
-  psrldq              m4, m0, 1                       ; t1-7 [word]
-  psrldq              m5, m0, 2                       ; t2-7 [word]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1
-  ; A2 B2 A1 B1 C1 D1 E1 F1
-  ; A3 B3 A2 B2 A1 B1 C1 D1
-  ; A4 B4 A3 B3 A2 B2 A1 B1
-  ; A5 B5 A4 B4 A3 B3 A2 B2
-  ; A6 B6 A5 B5 A4 B4 A3 B3
-  ; A7 B7 A6 B6 A5 B5 A4 B4
-  ; A8 B8 A7 B7 A6 B6 A5 B5
-  pavgb               m6, m1, m2                ; 2-tap avg A8-A1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
-
-  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
-
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-
-  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
-  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
-  movq  [dstq+strideq*2], m0
-  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
-  movq  [dstq+strideq  ], m0
-  psrldq              m0, 2                     ; A-H1
-  movq  [dstq          ], m0
-  lea               dstq, [dstq+strideq*4]
-  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
-  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
-  movq  [dstq+strideq*2], m6
-  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
-  movq  [dstq+strideq  ], m6
-  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
-  movq  [dstq          ], m6
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  mova                m0, [leftq]
-  movu                m7, [aboveq-1]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
-  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
-  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
-  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
-  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
-  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
-  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
-  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
-  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
-  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
-  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
-  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
-  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
-  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
-  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
-  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
-  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr             m5, m0, m6, 15
-  palignr             m3, m0, m6, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
-  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)]
-  pavgb               m5, m0                            ; A1 - Ag
-
-  punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
-  punpckhbw           m4, m5                            ; A-B9 ... A-Bg
-
-  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1
-
-  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  palignr             m2, m1, m6, 14
-  mova  [dstq          ], m2
-  palignr             m2, m1, m6, 12
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m1, m6, 10
-  mova  [dstq+strideq*2], m2
-  palignr             m2, m1, m6, 8
-  mova  [dstq+stride3q ], m2
-  lea               dstq, [dstq+strideq*4]
-  palignr             m2, m1, m6, 6
-  mova  [dstq          ], m2
-  palignr             m2, m1, m6, 4
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m1, m6, 2
-  mova  [dstq+strideq*2], m2
-  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
-  mova  [dstq+stride3q ], m6
-  lea               dstq, [dstq+strideq*4]
-
-  palignr             m2, m6, m4, 14
-  mova  [dstq          ], m2
-  palignr             m2, m6, m4, 12
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m6, m4, 10
-  mova  [dstq+strideq*2], m2
-  palignr             m2, m6, m4, 8
-  mova  [dstq+stride3q ], m2
-  lea               dstq, [dstq+strideq*4]
-  palignr             m2, m6, m4, 6
-  mova  [dstq          ], m2
-  palignr             m2, m6, m4, 4
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m6, m4, 2
-  mova  [dstq+strideq*2], m2
-  mova  [dstq+stride3q ], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  mova                  m0, [leftq]
-  movu                  m7, [aboveq-1]
-  movu                  m1, [aboveq+15]
-
-  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
-
-  palignr               m3, m1, m7, 1
-  palignr               m5, m1, m7, 2
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
-
-  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr               m5, m0, m7, 15
-  palignr               m3, m0, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
-  pavgb                 m5, m0                            ; A1 - Ag
-  punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
-  punpckhbw             m4, m5                            ; A-B9 ... A-Bg
-  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
-
-  DEFINE_ARGS dst, stride, stride3, left, line
-  lea             stride3q, [strideq*3]
-
-  palignr               m5, m2, m1, 14
-  palignr               m7, m1, m6, 14
-  mova  [dstq            ], m7
-  mova  [dstq+16         ], m5
-  palignr               m5, m2, m1, 12
-  palignr               m7, m1, m6, 12
-  mova  [dstq+strideq    ], m7
-  mova  [dstq+strideq+16 ], m5
-  palignr                m5, m2, m1, 10
-  palignr                m7, m1, m6, 10
-  mova  [dstq+strideq*2   ], m7
-  mova  [dstq+strideq*2+16], m5
-  palignr                m5, m2, m1, 8
-  palignr                m7, m1, m6, 8
-  mova  [dstq+stride3q    ], m7
-  mova  [dstq+stride3q+16 ], m5
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m5, m2, m1, 6
-  palignr                m7, m1, m6, 6
-  mova  [dstq             ], m7
-  mova  [dstq+16          ], m5
-  palignr                m5, m2, m1, 4
-  palignr                m7, m1, m6, 4
-  mova  [dstq+strideq     ], m7
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m2, m1, 2
-  palignr                m7, m1, m6, 2
-  mova  [dstq+strideq*2   ], m7
-  mova  [dstq+strideq*2+16], m5
-  mova  [dstq+stride3q    ], m6
-  mova  [dstq+stride3q+16 ], m1
-  lea                  dstq, [dstq+strideq*4]
-
-  palignr                m5, m1, m6, 14
-  palignr                m3, m6, m4, 14
-  mova  [dstq             ], m3
-  mova  [dstq+16          ], m5
-  palignr                m5, m1, m6, 12
-  palignr                m3, m6, m4, 12
-  mova  [dstq+strideq     ], m3
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m1, m6, 10
-  palignr                m3, m6, m4, 10
-  mova  [dstq+strideq*2   ], m3
-  mova  [dstq+strideq*2+16], m5
-  palignr                m5, m1, m6, 8
-  palignr                m3, m6, m4, 8
-  mova  [dstq+stride3q    ], m3
-  mova  [dstq+stride3q+16 ], m5
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m5, m1, m6, 6
-  palignr                m3, m6, m4, 6
-  mova  [dstq             ], m3
-  mova  [dstq+16          ], m5
-  palignr                m5, m1, m6, 4
-  palignr                m3, m6, m4, 4
-  mova  [dstq+strideq     ], m3
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m1, m6, 2
-  palignr                m3, m6, m4, 2
-  mova  [dstq+strideq*2   ], m3
-  mova  [dstq+strideq*2+16], m5
-  mova  [dstq+stride3q    ], m4
-  mova  [dstq+stride3q+16 ], m6
-  lea               dstq, [dstq+strideq*4]
-
-  mova                   m7, [leftq]
-  mova                   m3, [leftq+16]
-  palignr                m5, m3, m7, 15
-  palignr                m0, m3, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
-  pavgb                  m5, m3                            ; Ah -
-  punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
-  punpckhbw              m2, m5                            ; A-B9 ... A-Bg
-  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
-
-  palignr                m7, m6, m4, 14
-  palignr                m0, m4, m3, 14
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m6, m4, 12
-  palignr                m0, m4, m3, 12
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m6, m4, 10
-  palignr                m0, m4, m3, 10
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  palignr                m7, m6, m4, 8
-  palignr                m0, m4, m3, 8
-  mova  [dstq+stride3q    ], m0
-  mova  [dstq+stride3q+16 ], m7
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m7, m6, m4, 6
-  palignr                m0, m4, m3, 6
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m6, m4, 4
-  palignr                m0, m4, m3, 4
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m6, m4, 2
-  palignr                m0, m4, m3, 2
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  mova  [dstq+stride3q    ], m3
-  mova  [dstq+stride3q+16 ], m4
-  lea                  dstq, [dstq+strideq*4]
-
-  palignr                m7, m4, m3, 14
-  palignr                m0, m3, m2, 14
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m4, m3, 12
-  palignr                m0, m3, m2, 12
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m4, m3, 10
-  palignr                m0, m3, m2, 10
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  palignr                m7, m4, m3, 8
-  palignr                m0, m3, m2, 8
-  mova  [dstq+stride3q    ], m0
-  mova  [dstq+stride3q+16 ], m7
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m7, m4, m3, 6
-  palignr                m0, m3, m2, 6
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m4, m3, 4
-  palignr                m0, m3, m2, 4
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m4, m3, 2
-  palignr                m0, m3, m2, 2
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  mova  [dstq+stride3q    ], m2
-  mova  [dstq+stride3q+16 ], m3
-
-  RESTORE_GOT
-  RET
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c
deleted file mode 100644
index a9d6a127c..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c
+++ /dev/null
@@ -1,1238 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/x86/inv_txfm_common_avx2.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  __m256i in[16];
-  load_buffer_16x16(input, in);
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) {
-  const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]);
-  const __m256i v0 = _mm256_unpacklo_epi32(u0, u1);
-  const __m256i v1 = _mm256_unpackhi_epi32(u0, u1);
-  in[0] = _mm256_permute4x64_epi64(v0, 0xA8);
-  in[1] = _mm256_permute4x64_epi64(v0, 0xA9);
-  in[2] = _mm256_permute4x64_epi64(v1, 0xA8);
-  in[3] = _mm256_permute4x64_epi64(v1, 0xA9);
-}
-
-#define MM256_SHUFFLE_EPI64(x0, x1, imm8)                        \
-  _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \
-                                        _mm256_castsi256_pd(x1), imm8))
-
-static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) {
-  int i;
-  for (i = 0; i < 16; i += 4) {
-    transpose_col_to_row_nz4x4(&in[i]);
-  }
-
-  for (i = 0; i < 4; ++i) {
-    in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0);
-    in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0);
-  }
-
-  for (i = 0; i < 4; ++i) {
-    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
-  }
-}
-
-// Coefficients 0-7 before the final butterfly
-static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28);
-  const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04);
-
-  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i v1 = v0;
-
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  __m256i v5, v6;
-  unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6);
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v1, v5);
-  out[3] = _mm256_add_epi16(v0, v4);
-  out[4] = _mm256_sub_epi16(v0, v4);
-  out[5] = _mm256_sub_epi16(v1, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-// Coefficients 8-15 before the final butterfly
-static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
-  const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);
-
-  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-  const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
-  const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);
-
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  __m256i t1, t2, t5, t6;
-  unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  out[0] = _mm256_add_epi16(t0, t3);
-  out[1] = _mm256_add_epi16(t1, t2);
-  out[6] = _mm256_add_epi16(t6, t5);
-  out[7] = _mm256_add_epi16(t7, t4);
-
-  const __m256i v2 = _mm256_sub_epi16(t1, t2);
-  const __m256i v3 = _mm256_sub_epi16(t0, t3);
-  const __m256i v4 = _mm256_sub_epi16(t7, t4);
-  const __m256i v5 = _mm256_sub_epi16(t6, t5);
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
-}
-
-static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm256_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-static INLINE void idct16_10(__m256i *in /*in[16]*/) {
-  __m256i out[16];
-  idct16_10_first_half(in, out);
-  idct16_10_second_half(in, &out[8]);
-  add_sub_butterfly(out, in, 16);
-}
-
-void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[16];
-
-  load_coeff(input, &in[0]);
-  load_coeff(input + 16, &in[1]);
-  load_coeff(input + 32, &in[2]);
-  load_coeff(input + 48, &in[3]);
-
-  transpose_col_to_row_nz4x4(in);
-  idct16_10(in);
-
-  transpose_col_to_row_nz4x16(in);
-  idct16_10(in);
-
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-// Note:
-//  For 16x16 int16_t matrix
-//  transpose first 8 columns into first 8 rows.
-//  Since only upper-left 8x8 are non-zero, the input are first 8 rows (in[8]).
-//  After transposing, the 8 row vectors are in in[8].
-void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) {
-  __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]);
-  __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]);
-  __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]);
-
-  const __m256i v0 = _mm256_unpacklo_epi32(u0, u2);
-  const __m256i v1 = _mm256_unpackhi_epi32(u0, u2);
-  const __m256i v2 = _mm256_unpacklo_epi32(u1, u3);
-  const __m256i v3 = _mm256_unpackhi_epi32(u1, u3);
-
-  u0 = _mm256_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm256_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm256_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm256_unpackhi_epi16(in[6], in[7]);
-
-  const __m256i v4 = _mm256_unpacklo_epi32(u0, u2);
-  const __m256i v5 = _mm256_unpackhi_epi32(u0, u2);
-  const __m256i v6 = _mm256_unpacklo_epi32(u1, u3);
-  const __m256i v7 = _mm256_unpackhi_epi32(u1, u3);
-
-  in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0);
-  in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3);
-  in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0);
-  in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3);
-  in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0);
-  in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3);
-  in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0);
-  in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3);
-}
-
-// Note:
-//  For 16x16 int16_t matrix
-//  transpose first 8 columns into first 8 rows.
-//  Since only matrix left 8x16 are non-zero, the input are total 16 rows
-//  (in[16]).
-//  After transposing, the 8 row vectors are in in[8]. All else are zero.
-static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) {
-  transpose_col_to_row_nz8x8(in);
-  transpose_col_to_row_nz8x8(&in[8]);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
-  }
-}
-
-static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28);
-  __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04);
-
-  const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20);
-  __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12);
-
-  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24);
-  const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08);
-
-  const __m256i u4 = _mm256_add_epi16(t4, t5);
-  const __m256i u5 = _mm256_sub_epi16(t4, t5);
-  const __m256i u6 = _mm256_sub_epi16(t7, t6);
-  const __m256i u7 = _mm256_add_epi16(t7, t6);
-
-  const __m256i t0 = _mm256_add_epi16(u0, u3);
-  const __m256i t1 = _mm256_add_epi16(u1, u2);
-  const __m256i t2 = _mm256_sub_epi16(u1, u2);
-  const __m256i t3 = _mm256_sub_epi16(u0, u3);
-
-  t4 = u4;
-  t7 = u7;
-
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
-
-  out[0] = _mm256_add_epi16(t0, t7);
-  out[1] = _mm256_add_epi16(t1, t6);
-  out[2] = _mm256_add_epi16(t2, t5);
-  out[3] = _mm256_add_epi16(t3, t4);
-  out[4] = _mm256_sub_epi16(t3, t4);
-  out[5] = _mm256_sub_epi16(t2, t5);
-  out[6] = _mm256_sub_epi16(t1, t6);
-  out[7] = _mm256_sub_epi16(t0, t7);
-}
-
-static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
-  __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);
-
-  const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-  const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-  __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18);
-  __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14);
-
-  const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-  const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-  __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22);
-  __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10);
-
-  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-  __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
-  __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);
-
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-  v0 = _mm256_add_epi16(t0, t1);
-  v1 = _mm256_sub_epi16(t0, t1);
-  v2 = _mm256_sub_epi16(t3, t2);
-  v3 = _mm256_add_epi16(t2, t3);
-  v4 = _mm256_add_epi16(t4, t5);
-  v5 = _mm256_sub_epi16(t4, t5);
-  v6 = _mm256_sub_epi16(t7, t6);
-  v7 = _mm256_add_epi16(t6, t7);
-
-  t0 = v0;
-  t7 = v7;
-  t3 = v3;
-  t4 = v4;
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  v0 = _mm256_add_epi16(t0, t3);
-  v1 = _mm256_add_epi16(t1, t2);
-  v2 = _mm256_sub_epi16(t1, t2);
-  v3 = _mm256_sub_epi16(t0, t3);
-  v4 = _mm256_sub_epi16(t7, t4);
-  v5 = _mm256_sub_epi16(t6, t5);
-  v6 = _mm256_add_epi16(t6, t5);
-  v7 = _mm256_add_epi16(t7, t4);
-
-  // stage 6, (8-15)
-  out[0] = v0;
-  out[1] = v1;
-  out[6] = v6;
-  out[7] = v7;
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
-}
-
-static INLINE void idct16_38(__m256i *in /*in[16]*/) {
-  __m256i out[16];
-  idct16_38_first_half(in, out);
-  idct16_38_second_half(in, &out[8]);
-  add_sub_butterfly(out, in, 16);
-}
-
-void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[16];
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    load_coeff(input + (i << 4), &in[i]);
-  }
-
-  transpose_col_to_row_nz8x8(in);
-  idct16_38(in);
-
-  transpose_col_to_row_nz8x16(in);
-  idct16_38(in);
-
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-static INLINE int calculate_dc(const tran_low_t *input) {
-  int dc = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  dc = (int)dct_const_round_shift(dc * cospi_16_64);
-  dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS);
-  return dc;
-}
-
-void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const int dc = calculate_dc(input);
-  if (dc == 0) return;
-
-  const __m256i dc_value = _mm256_set1_epi16(dc);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    recon_and_store(&dc_value, dest);
-    dest += stride;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// 32x32 partial IDCT
-
-void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const int dc = calculate_dc(input);
-  if (dc == 0) return;
-
-  const __m256i dc_value = _mm256_set1_epi16(dc);
-
-  int i;
-  for (i = 0; i < 32; ++i) {
-    recon_and_store(&dc_value, dest);
-    recon_and_store(&dc_value, dest + 16);
-    dest += stride;
-  }
-}
-
-static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    load_coeff(input, &in[i]);
-    load_coeff(input + 16, &in[i + 16]);
-    input += 32;
-  }
-}
-
-// Note:
-//  We extend SSSE3 operations to AVX2. Instead of operating on __m128i, we
-// operate coefficients on __m256i. Our operation capacity doubles for each
-// instruction.
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)            \
-  do {                                              \
-    tmp0 = _mm256_madd_epi16(x0, co0);              \
-    tmp1 = _mm256_madd_epi16(x1, co0);              \
-    tmp2 = _mm256_madd_epi16(x0, co1);              \
-    tmp3 = _mm256_madd_epi16(x1, co1);              \
-    tmp0 = _mm256_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm256_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm256_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm256_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m256i *x0, const __m256i *x1,
-                             const __m256i *c0, const __m256i *c1, __m256i *y0,
-                             __m256i *y1) {
-  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm256_unpacklo_epi16(*x0, *x1);
-  u1 = _mm256_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm256_packs_epi32(tmp0, tmp1);
-  *y1 = _mm256_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0,
-                                  const __m256i *c1) {
-  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm256_unpacklo_epi16(*x0, *x1);
-  u1 = _mm256_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm256_packs_epi32(tmp0, tmp1);
-  *x1 = _mm256_packs_epi32(tmp2, tmp3);
-}
-
-// For each 16x32 block __m256i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m256i in[32]
-static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/,
-                                        __m256i *out /*out[16]*/) {
-  __m256i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m256i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64);
-    const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm256_add_epi16(u8, u9);
-  v9 = _mm256_sub_epi16(u8, u9);
-  v14 = _mm256_sub_epi16(u15, u14);
-  v15 = _mm256_add_epi16(u15, u14);
-
-  {
-    const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64);
-    const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm256_sub_epi16(u11, u10);
-  v11 = _mm256_add_epi16(u11, u10);
-  v12 = _mm256_add_epi16(u12, u13);
-  v13 = _mm256_sub_epi16(u12, u13);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(v8, v11);
-  out[1] = _mm256_add_epi16(v9, v10);
-  out[6] = _mm256_add_epi16(v14, v13);
-  out[7] = _mm256_add_epi16(v15, v12);
-
-  out[2] = _mm256_sub_epi16(v9, v10);
-  out[3] = _mm256_sub_epi16(v8, v11);
-  out[4] = _mm256_sub_epi16(v15, v12);
-  out[5] = _mm256_sub_epi16(v14, v13);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m256i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m256i in[32]
-static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/,
-                                        __m256i *out /*out[8]*/) {
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64);
-    const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm256_add_epi16(u4, u5);
-  v5 = _mm256_sub_epi16(u4, u5);
-  v6 = _mm256_sub_epi16(u7, u6);
-  v7 = _mm256_add_epi16(u7, u6);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm256_add_epi16(u0, u3);
-  v1 = _mm256_add_epi16(u1, u2);
-  v2 = _mm256_sub_epi16(u1, u2);
-  v3 = _mm256_sub_epi16(u0, u3);
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v2, v5);
-  out[3] = _mm256_add_epi16(v3, v4);
-  out[4] = _mm256_sub_epi16(v3, v4);
-  out[5] = _mm256_sub_epi16(v2, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m256i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m256i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/,
-                                          __m256i *out /*out[16]*/) {
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64);
-    const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64);
-    const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64);
-    const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64);
-    const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64);
-    const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
-    const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64);
-    const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm256_add_epi16(u16, u17);
-  v17 = _mm256_sub_epi16(u16, u17);
-  v18 = _mm256_sub_epi16(u19, u18);
-  v19 = _mm256_add_epi16(u19, u18);
-
-  v20 = _mm256_add_epi16(u20, u21);
-  v21 = _mm256_sub_epi16(u20, u21);
-  v22 = _mm256_sub_epi16(u23, u22);
-  v23 = _mm256_add_epi16(u23, u22);
-
-  v24 = _mm256_add_epi16(u24, u25);
-  v25 = _mm256_sub_epi16(u24, u25);
-  v26 = _mm256_sub_epi16(u27, u26);
-  v27 = _mm256_add_epi16(u27, u26);
-
-  v28 = _mm256_add_epi16(u28, u29);
-  v29 = _mm256_sub_epi16(u28, u29);
-  v30 = _mm256_sub_epi16(u31, u30);
-  v31 = _mm256_add_epi16(u31, u30);
-
-  {
-    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-
-  u24 = _mm256_add_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u27 = _mm256_sub_epi16(v24, v27);
-
-  u28 = _mm256_sub_epi16(v31, v28);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-  u31 = _mm256_add_epi16(v28, v31);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(u16, u23);
-  out[1] = _mm256_add_epi16(u17, u22);
-  out[2] = _mm256_add_epi16(u18, u21);
-  out[3] = _mm256_add_epi16(u19, u20);
-  out[4] = _mm256_sub_epi16(u19, u20);
-  out[5] = _mm256_sub_epi16(u18, u21);
-  out[6] = _mm256_sub_epi16(u17, u22);
-  out[7] = _mm256_sub_epi16(u16, u23);
-
-  out[8] = _mm256_sub_epi16(u31, u24);
-  out[9] = _mm256_sub_epi16(u30, u25);
-  out[10] = _mm256_sub_epi16(u29, u26);
-  out[11] = _mm256_sub_epi16(u28, u27);
-  out[12] = _mm256_add_epi16(u27, u28);
-  out[13] = _mm256_add_epi16(u26, u29);
-  out[14] = _mm256_add_epi16(u25, u30);
-  out[15] = _mm256_add_epi16(u24, u31);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/,
-                                          __m256i *out /*out[32]*/) {
-  __m256i temp[16];
-  idct32_full_16x32_quarter_1(in, temp);
-  idct32_full_16x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_16x32(const __m256i *in /*in[32]*/,
-                         __m256i *out /*out[32]*/) {
-  __m256i temp[32];
-  idct32_full_16x32_quarter_1_2(in, temp);
-  idct32_full_16x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  __m256i col[64], in[32];
-  int i;
-
-  for (i = 0; i < 2; ++i) {
-    load_buffer_32x16(input, in);
-    input += 32 << 4;
-
-    mm256_transpose_16x16(in, in);
-    mm256_transpose_16x16(&in[16], &in[16]);
-    idct32_16x32(in, col + (i << 5));
-  }
-
-  for (i = 0; i < 2; ++i) {
-    int j = i << 4;
-    mm256_transpose_16x16(col + j, in);
-    mm256_transpose_16x16(col + j + 32, &in[16]);
-    idct32_16x32(in, in);
-    store_buffer_16xN(in, stride, dest, 32);
-    dest += 16;
-  }
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/,
-                                       __m256i *out /*out[8]*/) {
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  {
-    const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-    const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-    const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-    u0 = _mm256_mulhrs_epi16(in[0], stk4_0);
-    u2 = _mm256_mulhrs_epi16(in[8], stk4_2);
-    u3 = _mm256_mulhrs_epi16(in[8], stk4_3);
-    u1 = u0;
-  }
-
-  v0 = _mm256_add_epi16(u0, u3);
-  v1 = _mm256_add_epi16(u1, u2);
-  v2 = _mm256_sub_epi16(u1, u2);
-  v3 = _mm256_sub_epi16(u0, u3);
-
-  {
-    const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-    const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-    const __m256i stk3_2 =
-        pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-    const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-    u4 = _mm256_mulhrs_epi16(in[4], stk3_0);
-    u7 = _mm256_mulhrs_epi16(in[4], stk3_1);
-    u5 = _mm256_mulhrs_epi16(in[12], stk3_2);
-    u6 = _mm256_mulhrs_epi16(in[12], stk3_3);
-  }
-
-  v4 = _mm256_add_epi16(u4, u5);
-  v5 = _mm256_sub_epi16(u4, u5);
-  v6 = _mm256_sub_epi16(u7, u6);
-  v7 = _mm256_add_epi16(u7, u6);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-  }
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v2, v5);
-  out[3] = _mm256_add_epi16(v3, v4);
-  out[4] = _mm256_sub_epi16(v3, v4);
-  out[5] = _mm256_sub_epi16(v2, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/,
-                                       __m256i *out /*out[8]*/) {
-  __m256i u8, u9, u10, u11, u12, u13, u14, u15;
-  __m256i v8, v9, v10, v11, v12, v13, v14, v15;
-
-  {
-    const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-    const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-    const __m256i stk2_2 =
-        pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-    const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-    const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-    const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-    const __m256i stk2_6 =
-        pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-    const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-    u8 = _mm256_mulhrs_epi16(in[2], stk2_0);
-    u15 = _mm256_mulhrs_epi16(in[2], stk2_1);
-    u9 = _mm256_mulhrs_epi16(in[14], stk2_2);
-    u14 = _mm256_mulhrs_epi16(in[14], stk2_3);
-    u10 = _mm256_mulhrs_epi16(in[10], stk2_4);
-    u13 = _mm256_mulhrs_epi16(in[10], stk2_5);
-    u11 = _mm256_mulhrs_epi16(in[6], stk2_6);
-    u12 = _mm256_mulhrs_epi16(in[6], stk2_7);
-  }
-
-  v8 = _mm256_add_epi16(u8, u9);
-  v9 = _mm256_sub_epi16(u8, u9);
-  v10 = _mm256_sub_epi16(u11, u10);
-  v11 = _mm256_add_epi16(u11, u10);
-  v12 = _mm256_add_epi16(u12, u13);
-  v13 = _mm256_sub_epi16(u12, u13);
-  v14 = _mm256_sub_epi16(u15, u14);
-  v15 = _mm256_add_epi16(u15, u14);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(v8, v11);
-  out[1] = _mm256_add_epi16(v9, v10);
-  out[2] = _mm256_sub_epi16(v9, v10);
-  out[3] = _mm256_sub_epi16(v8, v11);
-  out[4] = _mm256_sub_epi16(v15, v12);
-  out[5] = _mm256_sub_epi16(v14, v13);
-  out[6] = _mm256_add_epi16(v14, v13);
-  out[7] = _mm256_add_epi16(v15, v12);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/,
-                                     __m256i *out /*out[32]*/) {
-  __m256i temp[16];
-  idct32_16x32_135_quarter_1(in, temp);
-  idct32_16x32_135_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/,
-                                     __m256i *out /*out[32]*/) {
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-    const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-    const __m256i stk1_2 =
-        pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
-    const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
-    const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
-    const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
-    const __m256i stk1_6 =
-        pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-    const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-    const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-    const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-    const __m256i stk1_10 =
-        pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
-    const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
-    const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
-    const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
-    const __m256i stk1_14 =
-        pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-    const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-    u16 = _mm256_mulhrs_epi16(in[1], stk1_0);
-    u31 = _mm256_mulhrs_epi16(in[1], stk1_1);
-    u17 = _mm256_mulhrs_epi16(in[15], stk1_2);
-    u30 = _mm256_mulhrs_epi16(in[15], stk1_3);
-
-    u18 = _mm256_mulhrs_epi16(in[9], stk1_4);
-    u29 = _mm256_mulhrs_epi16(in[9], stk1_5);
-    u19 = _mm256_mulhrs_epi16(in[7], stk1_6);
-    u28 = _mm256_mulhrs_epi16(in[7], stk1_7);
-
-    u20 = _mm256_mulhrs_epi16(in[5], stk1_8);
-    u27 = _mm256_mulhrs_epi16(in[5], stk1_9);
-    u21 = _mm256_mulhrs_epi16(in[11], stk1_10);
-    u26 = _mm256_mulhrs_epi16(in[11], stk1_11);
-
-    u22 = _mm256_mulhrs_epi16(in[13], stk1_12);
-    u25 = _mm256_mulhrs_epi16(in[13], stk1_13);
-    u23 = _mm256_mulhrs_epi16(in[3], stk1_14);
-    u24 = _mm256_mulhrs_epi16(in[3], stk1_15);
-  }
-
-  v16 = _mm256_add_epi16(u16, u17);
-  v17 = _mm256_sub_epi16(u16, u17);
-  v18 = _mm256_sub_epi16(u19, u18);
-  v19 = _mm256_add_epi16(u19, u18);
-
-  v20 = _mm256_add_epi16(u20, u21);
-  v21 = _mm256_sub_epi16(u20, u21);
-  v22 = _mm256_sub_epi16(u23, u22);
-  v23 = _mm256_add_epi16(u23, u22);
-
-  v24 = _mm256_add_epi16(u24, u25);
-  v25 = _mm256_sub_epi16(u24, u25);
-  v26 = _mm256_sub_epi16(u27, u26);
-  v27 = _mm256_add_epi16(u27, u26);
-
-  v28 = _mm256_add_epi16(u28, u29);
-  v29 = _mm256_sub_epi16(u28, u29);
-  v30 = _mm256_sub_epi16(u31, u30);
-  v31 = _mm256_add_epi16(u31, u30);
-
-  {
-    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-
-  u24 = _mm256_add_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u27 = _mm256_sub_epi16(v24, v27);
-  u28 = _mm256_sub_epi16(v31, v28);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-  u31 = _mm256_add_epi16(v28, v31);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(u16, u23);
-  out[1] = _mm256_add_epi16(u17, u22);
-  out[2] = _mm256_add_epi16(u18, u21);
-  out[3] = _mm256_add_epi16(u19, u20);
-  v20 = _mm256_sub_epi16(u19, u20);
-  v21 = _mm256_sub_epi16(u18, u21);
-  v22 = _mm256_sub_epi16(u17, u22);
-  v23 = _mm256_sub_epi16(u16, u23);
-
-  v24 = _mm256_sub_epi16(u31, u24);
-  v25 = _mm256_sub_epi16(u30, u25);
-  v26 = _mm256_sub_epi16(u29, u26);
-  v27 = _mm256_sub_epi16(u28, u27);
-  out[12] = _mm256_add_epi16(u27, u28);
-  out[13] = _mm256_add_epi16(u26, u29);
-  out[14] = _mm256_add_epi16(u25, u30);
-  out[15] = _mm256_add_epi16(u24, u31);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
-    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
-    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
-    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
-  }
-}
-
-// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
-static void idct32_16x32_135(__m256i *in /*in[32]*/) {
-  __m256i out[32];
-  idct32_16x32_quarter_1_2(in, out);
-  idct32_16x32_quarter_3_4(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in,
-                                          int size) {
-  int i = 0;
-  while (i < size) {
-    load_coeff(coeff + (i << 5), &in[i]);
-    i += 1;
-  }
-}
-
-static INLINE void zero_buffer(__m256i *in, int num) {
-  int i;
-  for (i = 0; i < num; ++i) {
-    in[i] = _mm256_setzero_si256();
-  }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  __m256i in[32];
-  zero_buffer(in, 32);
-  load_buffer_from_32x32(input, in, 16);
-  mm256_transpose_16x16(in, in);
-  idct32_16x32_135(in);
-
-  __m256i out[32];
-  mm256_transpose_16x16(in, out);
-  idct32_16x32_135(out);
-  store_buffer_16xN(out, stride, dest, 32);
-  mm256_transpose_16x16(&in[16], in);
-  idct32_16x32_135(in);
-  store_buffer_16xN(in, stride, dest + 16, 32);
-}
-
-static void idct32_34_first_half(const __m256i *in, __m256i *stp1) {
-  const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
-  const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
-  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i x0, x1, x4, x5, x6, x7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
-  // phase 1
-
-  // 0, 15
-  u2 = _mm256_mulhrs_epi16(in[2], stk2_1);  // stp2_15
-  u3 = _mm256_mulhrs_epi16(in[6], stk2_7);  // stp2_12
-  v15 = _mm256_add_epi16(u2, u3);
-  // in[0], in[4]
-  x0 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
-  x7 = _mm256_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
-  v0 = _mm256_add_epi16(x0, x7);            // stp2_0
-  stp1[0] = _mm256_add_epi16(v0, v15);
-  stp1[15] = _mm256_sub_epi16(v0, v15);
-
-  // in[2], in[6]
-  u0 = _mm256_mulhrs_epi16(in[2], stk2_0);          // stp2_8
-  u1 = _mm256_mulhrs_epi16(in[6], stk2_6);          // stp2_11
-  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
-  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13
-
-  v8 = _mm256_add_epi16(u0, u1);
-  v9 = _mm256_add_epi16(u4, u6);
-  v10 = _mm256_sub_epi16(u4, u6);
-  v11 = _mm256_sub_epi16(u0, u1);
-  v12 = _mm256_sub_epi16(u2, u3);
-  v13 = _mm256_sub_epi16(u5, u7);
-  v14 = _mm256_add_epi16(u5, u7);
-
-  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
-  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
-  // 1, 14
-  x1 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
-  // stp1[2] = stp1[0], stp1[3] = stp1[1]
-  x4 = _mm256_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
-  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
-  v1 = _mm256_add_epi16(x1, x6);  // stp2_1
-  v2 = _mm256_add_epi16(x0, x5);  // stp2_2
-  stp1[1] = _mm256_add_epi16(v1, v14);
-  stp1[14] = _mm256_sub_epi16(v1, v14);
-
-  stp1[2] = _mm256_add_epi16(v2, v13);
-  stp1[13] = _mm256_sub_epi16(v2, v13);
-
-  v3 = _mm256_add_epi16(x1, x4);  // stp2_3
-  v4 = _mm256_sub_epi16(x1, x4);  // stp2_4
-
-  v5 = _mm256_sub_epi16(x0, x5);  // stp2_5
-
-  v6 = _mm256_sub_epi16(x1, x6);  // stp2_6
-  v7 = _mm256_sub_epi16(x0, x7);  // stp2_7
-  stp1[3] = _mm256_add_epi16(v3, v12);
-  stp1[12] = _mm256_sub_epi16(v3, v12);
-
-  stp1[6] = _mm256_add_epi16(v6, v9);
-  stp1[9] = _mm256_sub_epi16(v6, v9);
-
-  stp1[7] = _mm256_add_epi16(v7, v8);
-  stp1[8] = _mm256_sub_epi16(v7, v8);
-
-  stp1[4] = _mm256_add_epi16(v4, v11);
-  stp1[11] = _mm256_sub_epi16(v4, v11);
-
-  stp1[5] = _mm256_add_epi16(v5, v10);
-  stp1[10] = _mm256_sub_epi16(v5, v10);
-}
-
-static void idct32_34_second_half(const __m256i *in, __m256i *stp1) {
-  const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-  const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-  const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-  const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-  const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-  const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-  const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-  const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-  const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-  const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-  const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  v16 = _mm256_mulhrs_epi16(in[1], stk1_0);
-  v31 = _mm256_mulhrs_epi16(in[1], stk1_1);
-
-  v19 = _mm256_mulhrs_epi16(in[7], stk1_6);
-  v28 = _mm256_mulhrs_epi16(in[7], stk1_7);
-
-  v20 = _mm256_mulhrs_epi16(in[5], stk1_8);
-  v27 = _mm256_mulhrs_epi16(in[5], stk1_9);
-
-  v23 = _mm256_mulhrs_epi16(in[3], stk1_14);
-  v24 = _mm256_mulhrs_epi16(in[3], stk1_15);
-
-  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
-  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
-  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
-  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-  u24 = _mm256_add_epi16(v24, v27);
-  u27 = _mm256_sub_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u28 = _mm256_sub_epi16(v31, v28);
-  u31 = _mm256_add_epi16(v28, v31);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-
-  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
-  stp1[0] = _mm256_add_epi16(u16, u23);
-  stp1[7] = _mm256_sub_epi16(u16, u23);
-
-  stp1[1] = _mm256_add_epi16(u17, u22);
-  stp1[6] = _mm256_sub_epi16(u17, u22);
-
-  stp1[2] = _mm256_add_epi16(u18, u21);
-  stp1[5] = _mm256_sub_epi16(u18, u21);
-
-  stp1[3] = _mm256_add_epi16(u19, u20);
-  stp1[4] = _mm256_sub_epi16(u19, u20);
-
-  stp1[8] = _mm256_sub_epi16(u31, u24);
-  stp1[15] = _mm256_add_epi16(u24, u31);
-
-  stp1[9] = _mm256_sub_epi16(u30, u25);
-  stp1[14] = _mm256_add_epi16(u25, u30);
-
-  stp1[10] = _mm256_sub_epi16(u29, u26);
-  stp1[13] = _mm256_add_epi16(u26, u29);
-
-  stp1[11] = _mm256_sub_epi16(u28, u27);
-  stp1[12] = _mm256_add_epi16(u27, u28);
-
-  butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0);
-}
-
-// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
-static void idct32_16x32_34(__m256i *in /*in[32]*/) {
-  __m256i out[32];
-  idct32_34_first_half(in, out);
-  idct32_34_second_half(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[32];
-  zero_buffer(in, 32);
-  load_buffer_from_32x32(input, in, 8);
-  mm256_transpose_16x16(in, in);
-  idct32_16x32_34(in);
-
-  __m256i out[32];
-  mm256_transpose_16x16(in, out);
-  idct32_16x32_34(out);
-  store_buffer_16xN(out, stride, dest, 32);
-  mm256_transpose_16x16(&in[16], in);
-  idct32_16x32_34(in);
-  store_buffer_16xN(in, stride, dest + 16, 32);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
deleted file mode 100644
index 26c5cfe59..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
-#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
-
-#include <immintrin.h>
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-  if (sizeof(tran_low_t) == 4) {
-    *in = _mm256_setr_epi16(
-        (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
-        (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
-        (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
-        (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
-        (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
-        (int16_t)coeff[15]);
-  } else {
-    *in = _mm256_loadu_si256((const __m256i *)coeff);
-  }
-}
-
-static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
-  int i = 0;
-  while (i < 16) {
-    load_coeff(coeff + (i << 4), &in[i]);
-    i += 1;
-  }
-}
-
-static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i x = _mm_loadu_si128((__m128i const *)output);
-  __m128i p0 = _mm_unpacklo_epi8(x, zero);
-  __m128i p1 = _mm_unpackhi_epi8(x, zero);
-
-  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
-  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
-  x = _mm_packus_epi16(p0, p1);
-  _mm_storeu_si128((__m128i *)output, x);
-}
-
-#define IDCT_ROUNDING_POS (6)
-static INLINE void store_buffer_16xN(__m256i *in, const int stride,
-                                     uint8_t *output, int num) {
-  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
-  int i = 0;
-
-  while (i < num) {
-    in[i] = _mm256_adds_epi16(in[i], rounding);
-    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
-    recon_and_store(&in[i], output + i * stride);
-    i += 1;
-  }
-}
-
-static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
-                                     const __m256i *c0, const __m256i *c1,
-                                     __m256i *b0, __m256i *b1) {
-  __m256i x0, x1;
-  x0 = _mm256_unpacklo_epi16(*a0, *a1);
-  x1 = _mm256_unpackhi_epi16(*a0, *a1);
-  *b0 = butter_fly(&x0, &x1, c0);
-  *b1 = butter_fly(&x0, &x1, c1);
-}
-
-void av1_idct16_avx2(__m256i *in);
-
-#endif  // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
deleted file mode 100644
index 86ce928b7..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
+++ /dev/null
@@ -1,3500 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x)                    \
-  {                                                       \
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                     \
-    d0 = _mm_add_epi16(in_x, d0);                         \
-    d0 = _mm_packus_epi16(d0, d0);                        \
-    *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
-  }
-
-void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = load_input_data(input);
-  input2 = load_input_data(input + 8);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-void aom_idct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-
-  array_transpose_4x4(in);
-  // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
-
-  // stage 2
-  in[0] = _mm_add_epi16(u[0], u[1]);
-  in[1] = _mm_sub_epi16(u[0], u[1]);
-  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-void aom_iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
-
-  array_transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
-                               res0, res1, res2, res3)                         \
-  {                                                                            \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
-    tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
-    tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
-    tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
-    tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
-                                                                               \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
-    res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
-    res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
-  }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {                                                                  \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
-                                                                     \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
-                                                                     \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
-                                                                     \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
-              out4, out5, out6, out7)                                         \
-  {                                                                           \
-    /* Stage1 */                                                              \
-    {                                                                         \
-      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
-      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
-      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
-      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
-                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
-    }                                                                         \
-                                                                              \
-    /* Stage2 */                                                              \
-    {                                                                         \
-      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
-      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
-      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
-      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
-                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
-                                                                              \
-      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                                \
-      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                                \
-      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                                \
-      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                                \
-    }                                                                         \
-                                                                              \
-    /* Stage3 */                                                              \
-    {                                                                         \
-      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
-      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
-                                                                              \
-      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                                \
-      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                                \
-      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                                \
-      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                                \
-                                                                              \
-      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
-      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
-      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
-      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
-                                                                              \
-      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
-      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
-      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
-      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
-                                                                              \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
-                                                                              \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
-    }                                                                         \
-                                                                              \
-    /* Stage4  */                                                             \
-    out0 = _mm_adds_epi16(stp1_0, stp2_7);                                    \
-    out1 = _mm_adds_epi16(stp1_1, stp1_6);                                    \
-    out2 = _mm_adds_epi16(stp1_2, stp1_5);                                    \
-    out3 = _mm_adds_epi16(stp1_3, stp2_4);                                    \
-    out4 = _mm_subs_epi16(stp1_3, stp2_4);                                    \
-    out5 = _mm_subs_epi16(stp1_2, stp1_5);                                    \
-    out6 = _mm_subs_epi16(stp1_1, stp1_6);                                    \
-    out7 = _mm_subs_epi16(stp1_0, stp2_7);                                    \
-  }
-
-void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from aom_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
-          in6, in7);
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest + 0 * stride, dc_value);
-  RECON_AND_STORE(dest + 1 * stride, dc_value);
-  RECON_AND_STORE(dest + 2 * stride, dc_value);
-  RECON_AND_STORE(dest + 3 * stride, dc_value);
-  RECON_AND_STORE(dest + 4 * stride, dc_value);
-  RECON_AND_STORE(dest + 5 * stride, dc_value);
-  RECON_AND_STORE(dest + 6 * stride, dc_value);
-  RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void aom_idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // 8x8 Transpose is copied from aom_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
-                in1, in2, in3, in4, in5, in6, in7);
-
-  // 4-stage 1D idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
-        in[4], in[5], in[6], in[7]);
-}
-
-void aom_iadst8_sse2(__m128i *in) {
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
-        in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16                                                                 \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
-                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
-                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
-                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
-                                                                               \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-                                                                               \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-#define IDCT16_10                                                              \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
-                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
-                           stp1_12_0)                                          \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
-                                                                               \
-    stp1_9 = stp1_8_0;                                                         \
-    stp1_10 = stp1_11;                                                         \
-                                                                               \
-    stp1_13 = stp1_12_0;                                                       \
-    stp1_14 = stp1_15;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
-    stp2_5 = stp2_4;                                                           \
-    stp2_6 = stp2_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_2 = stp1_1;                                                           \
-    stp1_3 = stp1_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  curr1 = l;
-  for (i = 0; i < 2; i++) {
-    // 1-D idct
-
-    // Load input data.
-    in[0] = load_input_data(input);
-    in[8] = load_input_data(input + 8 * 1);
-    in[1] = load_input_data(input + 8 * 2);
-    in[9] = load_input_data(input + 8 * 3);
-    in[2] = load_input_data(input + 8 * 4);
-    in[10] = load_input_data(input + 8 * 5);
-    in[3] = load_input_data(input + 8 * 6);
-    in[11] = load_input_data(input + 8 * 7);
-    in[4] = load_input_data(input + 8 * 8);
-    in[12] = load_input_data(input + 8 * 9);
-    in[5] = load_input_data(input + 8 * 10);
-    in[13] = load_input_data(input + 8 * 11);
-    in[6] = load_input_data(input + 8 * 12);
-    in[14] = load_input_data(input + 8 * 13);
-    in[7] = load_input_data(input + 8 * 14);
-    in[15] = load_input_data(input + 8 * 15);
-
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
-    input += 128;
-  }
-  for (i = 0; i < 2; i++) {
-    int j;
-    // 1-D idct
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // 2-D
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 16; ++i) {
-    RECON_AND_STORE(dest + 0, dc_value);
-    RECON_AND_STORE(dest + 8, dc_value);
-    dest += stride;
-  }
-}
-
-void iadst16_8col(__m128i *in) {
-  // perform 16x16 1-D ADST for 8 columns
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
-}
-
-void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
-      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
-      stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-#define LOAD_DQCOEFF(reg, input)  \
-  {                               \
-    reg = load_input_data(input); \
-    input += 8;                   \
-  }
-
-#define IDCT32_34                                                              \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
-                                                                               \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
-                                                                               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
-                             stp1_31);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
-                             stp1_28);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
-                             stp1_27);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
-                             stp1_24);                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
-                             stp2_15);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
-                             stp2_12);                                         \
-                                                                               \
-    stp2_16 = stp1_16;                                                         \
-    stp2_19 = stp1_19;                                                         \
-                                                                               \
-    stp2_20 = stp1_20;                                                         \
-    stp2_23 = stp1_23;                                                         \
-                                                                               \
-    stp2_24 = stp1_24;                                                         \
-    stp2_27 = stp1_27;                                                         \
-                                                                               \
-    stp2_28 = stp1_28;                                                         \
-    stp2_31 = stp1_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
-                             stp1_7);                                          \
-                                                                               \
-    stp1_8 = stp2_8;                                                           \
-    stp1_11 = stp2_11;                                                         \
-    stp1_12 = stp2_12;                                                         \
-    stp1_15 = stp2_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
-                             stp2_1);                                          \
-                                                                               \
-    stp2_4 = stp1_4;                                                           \
-    stp2_5 = stp1_4;                                                           \
-    stp2_6 = stp1_7;                                                           \
-    stp2_7 = stp1_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = stp2_0;                                                           \
-    stp1_1 = stp2_1;                                                           \
-    stp1_2 = stp2_1;                                                           \
-    stp1_3 = stp2_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
-#define IDCT32(in0, in1)                                                       \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]);           \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]);           \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]);          \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]);          \
-                                                                               \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]);            \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]);            \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]);            \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]);            \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]);           \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]);           \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]);          \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]);          \
-                                                                               \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]);          \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]);          \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]);           \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]);           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
-                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
-                           stp1_30)                                            \
-    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
-                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
-                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
-                           stp1_21, stp1_26)                                   \
-    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
-                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
-                           stp1_23, stp1_24)                                   \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]);           \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]);           \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]);          \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]);          \
-                                                                               \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]);          \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]);          \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]);           \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]);           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
-                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
-                           stp2_14)                                            \
-    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
-                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
-    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
-                                                                               \
-    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
-    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
-                                                                               \
-    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
-    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]);           \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]);           \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]);          \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]);          \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
-                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
-                           stp1_6)                                             \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]);            \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]);            \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]);            \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]);            \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
-
-  for (i = 8; i < 32; ++i) {
-    in[i] = _mm_setzero_si128();
-  }
-
-  array_transpose_8x8(in, in);
-  // TODO(hkuang): Following transposes are unnecessary. But remove them will
-  // lead to performance drop on some devices.
-  array_transpose_8x8(in + 8, in + 8);
-  array_transpose_8x8(in + 16, in + 16);
-  array_transpose_8x8(in + 24, in + 24);
-
-  IDCT32_34
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    IDCT32_34
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D idct
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // checking if all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    IDCT32(in, in + 16)
-
-    // 1_D: Store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-  }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D idct
-    j = i << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    IDCT32(in, in + 16)
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, j;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (j = 0; j < 32; ++j) {
-    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
-  }
-}
-
-// Apply a 32-element IDCT to 8 columns. This does not do any transposition
-// of its input - the caller is expected to have done that.
-// The input buffers are the top and bottom halves of an 8x32 block.
-void idct32_8col(__m128i *in0, __m128i *in1) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  IDCT32(in0, in1)
-
-  // 2_D: Calculate the results and store them to destination.
-  in0[0] = _mm_add_epi16(stp1_0, stp1_31);
-  in0[1] = _mm_add_epi16(stp1_1, stp1_30);
-  in0[2] = _mm_add_epi16(stp1_2, stp1_29);
-  in0[3] = _mm_add_epi16(stp1_3, stp1_28);
-  in0[4] = _mm_add_epi16(stp1_4, stp1_27);
-  in0[5] = _mm_add_epi16(stp1_5, stp1_26);
-  in0[6] = _mm_add_epi16(stp1_6, stp1_25);
-  in0[7] = _mm_add_epi16(stp1_7, stp1_24);
-  in0[8] = _mm_add_epi16(stp1_8, stp1_23);
-  in0[9] = _mm_add_epi16(stp1_9, stp1_22);
-  in0[10] = _mm_add_epi16(stp1_10, stp1_21);
-  in0[11] = _mm_add_epi16(stp1_11, stp1_20);
-  in0[12] = _mm_add_epi16(stp1_12, stp1_19);
-  in0[13] = _mm_add_epi16(stp1_13, stp1_18);
-  in0[14] = _mm_add_epi16(stp1_14, stp1_17);
-  in0[15] = _mm_add_epi16(stp1_15, stp1_16);
-  in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
-  in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
-  in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
-  in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
-  in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
-  in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
-  in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
-  in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
-  in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
-  in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
-  in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
-  in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
-  in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
-  in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
-  in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
-  in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
deleted file mode 100644
index 342816977..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
-  }
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
-// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
-  if (sizeof(tran_low_t) == 4) {
-    return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
-                          data[6], data[7]);
-  } else {
-    return _mm_load_si128((const __m128i *)data);
-  }
-}
-
-static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
-
-  in[8] = load_input_data(input + 8 * 16);
-  in[9] = load_input_data(input + 9 * 16);
-  in[10] = load_input_data(input + 10 * 16);
-  in[11] = load_input_data(input + 11 * 16);
-  in[12] = load_input_data(input + 12 * 16);
-  in[13] = load_input_data(input + 13 * 16);
-  in[14] = load_input_data(input + 14 * 16);
-  in[15] = load_input_data(input + 15 * 16);
-}
-
-#define RECON_AND_STORE(dest, in_x)                  \
-  {                                                  \
-    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                \
-    d0 = _mm_add_epi16(in_x, d0);                    \
-    d0 = _mm_packus_epi16(d0, d0);                   \
-    _mm_storel_epi64((__m128i *)(dest), d0);         \
-  }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-  in[8] = _mm_adds_epi16(in[8], final_rounding);
-  in[9] = _mm_adds_epi16(in[9], final_rounding);
-  in[10] = _mm_adds_epi16(in[10], final_rounding);
-  in[11] = _mm_adds_epi16(in[11], final_rounding);
-  in[12] = _mm_adds_epi16(in[12], final_rounding);
-  in[13] = _mm_adds_epi16(in[13], final_rounding);
-  in[14] = _mm_adds_epi16(in[14], final_rounding);
-  in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-  in[8] = _mm_srai_epi16(in[8], 6);
-  in[9] = _mm_srai_epi16(in[9], 6);
-  in[10] = _mm_srai_epi16(in[10], 6);
-  in[11] = _mm_srai_epi16(in[11], 6);
-  in[12] = _mm_srai_epi16(in[12], 6);
-  in[13] = _mm_srai_epi16(in[13], 6);
-  in[14] = _mm_srai_epi16(in[14], 6);
-  in[15] = _mm_srai_epi16(in[15], 6);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-  RECON_AND_STORE(dest + 8 * stride, in[8]);
-  RECON_AND_STORE(dest + 9 * stride, in[9]);
-  RECON_AND_STORE(dest + 10 * stride, in[10]);
-  RECON_AND_STORE(dest + 11 * stride, in[11]);
-  RECON_AND_STORE(dest + 12 * stride, in[12]);
-  RECON_AND_STORE(dest + 13 * stride, in[13]);
-  RECON_AND_STORE(dest + 14 * stride, in[14]);
-  RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
-  {                                                                      \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0);                \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3);                \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2);                \
-                                                                         \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);              \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);              \
-                                                                         \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                             \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                             \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                             \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                             \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                                      \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
-  }
-
-void iadst16_8col(__m128i *in);
-void idct16_8col(__m128i *in);
-void aom_idct4_sse2(__m128i *in);
-void aom_idct8_sse2(__m128i *in);
-void aom_idct16_sse2(__m128i *in0, __m128i *in1);
-void aom_iadst4_sse2(__m128i *in);
-void aom_iadst8_sse2(__m128i *in);
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
-void idct32_8col(__m128i *in0, __m128i *in1);
-
-#endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
deleted file mode 100644
index 9d006797b..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
+++ /dev/null
@@ -1,1333 +0,0 @@
-/*
- *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    {
-      /* Stage1 */
-      {
-        const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
-        const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
-        const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
-        const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
-
-        {
-          tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-          tmp1 = _mm_madd_epi16(hi_17, stg1_0);
-          tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-          tmp3 = _mm_madd_epi16(hi_17, stg1_1);
-          tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-          tmp5 = _mm_madd_epi16(hi_35, stg1_2);
-          tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-          tmp7 = _mm_madd_epi16(hi_35, stg1_3);
-
-          tmp0 = _mm_add_epi32(tmp0, rounding);
-          tmp1 = _mm_add_epi32(tmp1, rounding);
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-          tmp4 = _mm_add_epi32(tmp4, rounding);
-          tmp5 = _mm_add_epi32(tmp5, rounding);
-          tmp6 = _mm_add_epi32(tmp6, rounding);
-          tmp7 = _mm_add_epi32(tmp7, rounding);
-
-          tmp0 = _mm_srai_epi32(tmp0, 14);
-          tmp1 = _mm_srai_epi32(tmp1, 14);
-          tmp2 = _mm_srai_epi32(tmp2, 14);
-          tmp3 = _mm_srai_epi32(tmp3, 14);
-          tmp4 = _mm_srai_epi32(tmp4, 14);
-          tmp5 = _mm_srai_epi32(tmp5, 14);
-          tmp6 = _mm_srai_epi32(tmp6, 14);
-          tmp7 = _mm_srai_epi32(tmp7, 14);
-
-          stp1_4 = _mm_packs_epi32(tmp0, tmp1);
-          stp1_7 = _mm_packs_epi32(tmp2, tmp3);
-          stp1_5 = _mm_packs_epi32(tmp4, tmp5);
-          stp1_6 = _mm_packs_epi32(tmp6, tmp7);
-        }
-      }
-
-      /* Stage2 */
-      {
-        const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
-        const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
-
-        {
-          tmp0 = _mm_unpacklo_epi16(in0, in4);
-          tmp1 = _mm_unpackhi_epi16(in0, in4);
-
-          tmp2 = _mm_madd_epi16(tmp0, stk2_0);
-          tmp3 = _mm_madd_epi16(tmp1, stk2_0);
-          tmp4 = _mm_madd_epi16(tmp0, stk2_1);
-          tmp5 = _mm_madd_epi16(tmp1, stk2_1);
-
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-          tmp4 = _mm_add_epi32(tmp4, rounding);
-          tmp5 = _mm_add_epi32(tmp5, rounding);
-
-          tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-          tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-          tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-          tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
-          stp2_0 = _mm_packs_epi32(tmp2, tmp3);
-          stp2_1 = _mm_packs_epi32(tmp4, tmp5);
-
-          tmp0 = _mm_madd_epi16(lo_26, stg2_2);
-          tmp1 = _mm_madd_epi16(hi_26, stg2_2);
-          tmp2 = _mm_madd_epi16(lo_26, stg2_3);
-          tmp3 = _mm_madd_epi16(hi_26, stg2_3);
-
-          tmp0 = _mm_add_epi32(tmp0, rounding);
-          tmp1 = _mm_add_epi32(tmp1, rounding);
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-
-          tmp0 = _mm_srai_epi32(tmp0, 14);
-          tmp1 = _mm_srai_epi32(tmp1, 14);
-          tmp2 = _mm_srai_epi32(tmp2, 14);
-          tmp3 = _mm_srai_epi32(tmp3, 14);
-
-          stp2_2 = _mm_packs_epi32(tmp0, tmp1);
-          stp2_3 = _mm_packs_epi32(tmp2, tmp3);
-        }
-
-        stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-        stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-        stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-        stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-      }
-
-      /* Stage3 */
-      {
-        stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-        stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-        stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-        stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-        tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-        tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-        tmp2 = _mm_madd_epi16(tmp0, stk2_1);
-        tmp3 = _mm_madd_epi16(tmp1, stk2_1);
-        tmp4 = _mm_madd_epi16(tmp0, stk2_0);
-        tmp5 = _mm_madd_epi16(tmp1, stk2_0);
-
-        tmp2 = _mm_add_epi32(tmp2, rounding);
-        tmp3 = _mm_add_epi32(tmp3, rounding);
-        tmp4 = _mm_add_epi32(tmp4, rounding);
-        tmp5 = _mm_add_epi32(tmp5, rounding);
-
-        tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-        tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-        tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-        tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
-        stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-        stp1_6 = _mm_packs_epi32(tmp4, tmp5);
-      }
-
-      /* Stage4  */
-      in0 = _mm_add_epi16(stp1_0, stp2_7);
-      in1 = _mm_add_epi16(stp1_1, stp1_6);
-      in2 = _mm_add_epi16(stp1_2, stp1_5);
-      in3 = _mm_add_epi16(stp1_3, stp2_4);
-      in4 = _mm_sub_epi16(stp1_3, stp2_4);
-      in5 = _mm_sub_epi16(stp1_2, stp1_5);
-      in6 = _mm_sub_epi16(stp1_1, stp1_6);
-      in7 = _mm_sub_epi16(stp1_0, stp2_7);
-    }
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-
-  // Stage1
-  tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
-  tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
-  tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
-  tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
-
-  stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
-  stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
-
-  // Stage2
-  tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
-  stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
-
-  tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
-  tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
-  stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
-
-  tmp0 = _mm_add_epi16(stp1_4, stp1_5);
-  tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
-
-  stp2_4 = tmp0;
-  stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-  stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-
-  tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-  tmp1 = _mm_madd_epi16(tmp0, stg3_0);
-  tmp2 = _mm_madd_epi16(tmp0, stk2_0);  // stg3_1 = stk2_0
-
-  tmp1 = _mm_add_epi32(tmp1, rounding);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-  stp1_5 = _mm_packs_epi32(tmp1, tmp2);
-
-  // Stage3
-  tmp2 = _mm_add_epi16(stp2_0, stp2_2);
-  tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
-
-  stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
-  stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
-
-  // Stage4
-  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  /* Stage1 */
-  stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
-  stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
-  stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
-  stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
-
-  /* Stage2 */
-  stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
-  stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
-
-  stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
-  stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
-
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-  tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-  tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-  tmp2 = _mm_madd_epi16(tmp0, stk2_0);
-  tmp3 = _mm_madd_epi16(tmp1, stk2_0);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp3 = _mm_add_epi32(tmp3, rounding);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
-  tmp2 = _mm_madd_epi16(tmp0, stk2_1);
-  tmp3 = _mm_madd_epi16(tmp1, stk2_1);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp3 = _mm_add_epi32(tmp3, rounding);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-  stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-
-  /* Stage4  */
-  in0 = _mm_add_epi16(stp1_0, stp2_7);
-  in1 = _mm_add_epi16(stp1_1, stp1_6);
-  in2 = _mm_add_epi16(stp1_2, stp1_5);
-  in3 = _mm_add_epi16(stp1_3, stp2_4);
-  in4 = _mm_sub_epi16(stp1_3, stp2_4);
-  in5 = _mm_sub_epi16(stp1_2, stp1_5);
-  in6 = _mm_sub_epi16(stp1_1, stp1_6);
-  in7 = _mm_sub_epi16(stp1_0, stp2_7);
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-// Only do addition and subtraction butterfly, size = 16, 32
-static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)         \
-  do {                                           \
-    tmp0 = _mm_madd_epi16(x0, co0);              \
-    tmp1 = _mm_madd_epi16(x1, co0);              \
-    tmp2 = _mm_madd_epi16(x0, co1);              \
-    tmp3 = _mm_madd_epi16(x1, co1);              \
-    tmp0 = _mm_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
-                             const __m128i *c0, const __m128i *c1, __m128i *y0,
-                             __m128i *y1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm_packs_epi32(tmp0, tmp1);
-  *y1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
-                                  const __m128i *c1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm_packs_epi32(tmp0, tmp1);
-  *x1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
-  const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
-  const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x4, x5, x6, x7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
-  // phase 1
-
-  // 0, 15
-  u2 = _mm_mulhrs_epi16(in[2], stk2_1);  // stp2_15
-  u3 = _mm_mulhrs_epi16(in[6], stk2_7);  // stp2_12
-  v15 = _mm_add_epi16(u2, u3);
-  // in[0], in[4]
-  x0 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
-  x7 = _mm_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
-  v0 = _mm_add_epi16(x0, x7);            // stp2_0
-  stp1[0] = _mm_add_epi16(v0, v15);
-  stp1[15] = _mm_sub_epi16(v0, v15);
-
-  // in[2], in[6]
-  u0 = _mm_mulhrs_epi16(in[2], stk2_0);             // stp2_8
-  u1 = _mm_mulhrs_epi16(in[6], stk2_6);             // stp2_11
-  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
-  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13
-
-  v8 = _mm_add_epi16(u0, u1);
-  v9 = _mm_add_epi16(u4, u6);
-  v10 = _mm_sub_epi16(u4, u6);
-  v11 = _mm_sub_epi16(u0, u1);
-  v12 = _mm_sub_epi16(u2, u3);
-  v13 = _mm_sub_epi16(u5, u7);
-  v14 = _mm_add_epi16(u5, u7);
-
-  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
-  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
-  // 1, 14
-  x1 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
-  // stp1[2] = stp1[0], stp1[3] = stp1[1]
-  x4 = _mm_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
-  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
-  v1 = _mm_add_epi16(x1, x6);  // stp2_1
-  v2 = _mm_add_epi16(x0, x5);  // stp2_2
-  stp1[1] = _mm_add_epi16(v1, v14);
-  stp1[14] = _mm_sub_epi16(v1, v14);
-
-  stp1[2] = _mm_add_epi16(v2, v13);
-  stp1[13] = _mm_sub_epi16(v2, v13);
-
-  v3 = _mm_add_epi16(x1, x4);  // stp2_3
-  v4 = _mm_sub_epi16(x1, x4);  // stp2_4
-
-  v5 = _mm_sub_epi16(x0, x5);  // stp2_5
-
-  v6 = _mm_sub_epi16(x1, x6);  // stp2_6
-  v7 = _mm_sub_epi16(x0, x7);  // stp2_7
-  stp1[3] = _mm_add_epi16(v3, v12);
-  stp1[12] = _mm_sub_epi16(v3, v12);
-
-  stp1[6] = _mm_add_epi16(v6, v9);
-  stp1[9] = _mm_sub_epi16(v6, v9);
-
-  stp1[7] = _mm_add_epi16(v7, v8);
-  stp1[8] = _mm_sub_epi16(v7, v8);
-
-  stp1[4] = _mm_add_epi16(v4, v11);
-  stp1[11] = _mm_sub_epi16(v4, v11);
-
-  stp1[5] = _mm_add_epi16(v5, v10);
-  stp1[10] = _mm_sub_epi16(v5, v10);
-}
-
-static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
-  const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-  const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-  const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-  const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-  const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-  const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-  const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-  const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  v16 = _mm_mulhrs_epi16(in[1], stk1_0);
-  v31 = _mm_mulhrs_epi16(in[1], stk1_1);
-
-  v19 = _mm_mulhrs_epi16(in[7], stk1_6);
-  v28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
-  v20 = _mm_mulhrs_epi16(in[5], stk1_8);
-  v27 = _mm_mulhrs_epi16(in[5], stk1_9);
-
-  v23 = _mm_mulhrs_epi16(in[3], stk1_14);
-  v24 = _mm_mulhrs_epi16(in[3], stk1_15);
-
-  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
-  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
-  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
-  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-  u24 = _mm_add_epi16(v24, v27);
-  u27 = _mm_sub_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u28 = _mm_sub_epi16(v31, v28);
-  u31 = _mm_add_epi16(v28, v31);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-
-  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
-  stp1[16] = _mm_add_epi16(u16, u23);
-  stp1[23] = _mm_sub_epi16(u16, u23);
-
-  stp1[17] = _mm_add_epi16(u17, u22);
-  stp1[22] = _mm_sub_epi16(u17, u22);
-
-  stp1[18] = _mm_add_epi16(u18, u21);
-  stp1[21] = _mm_sub_epi16(u18, u21);
-
-  stp1[19] = _mm_add_epi16(u19, u20);
-  stp1[20] = _mm_sub_epi16(u19, u20);
-
-  stp1[24] = _mm_sub_epi16(u31, u24);
-  stp1[31] = _mm_add_epi16(u24, u31);
-
-  stp1[25] = _mm_sub_epi16(u30, u25);
-  stp1[30] = _mm_add_epi16(u25, u30);
-
-  stp1[26] = _mm_sub_epi16(u29, u26);
-  stp1[29] = _mm_add_epi16(u26, u29);
-
-  stp1[27] = _mm_sub_epi16(u28, u27);
-  stp1[28] = _mm_add_epi16(u27, u28);
-
-  butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  __m128i in[32], col[32];
-  __m128i stp1[32];
-  int i;
-
-  // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
-
-  array_transpose_8x8(in, in);
-  idct32_34_first_half(in, stp1);
-  idct32_34_second_half(in, stp1);
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  add_sub_butterfly(stp1, col, 32);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    idct32_34_first_half(in, stp1);
-    idct32_34_second_half(in, stp1);
-
-    // 2_D: Calculate the results and store them to destination.
-    add_sub_butterfly(stp1, in, 32);
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-// in0[16] represents the left 8x16 block
-// in1[16] represents the right 8x16 block
-static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
-                              __m128i *in1) {
-  int i;
-  for (i = 0; i < 16; i++) {
-    in0[i] = load_input_data(input);
-    in1[i] = load_input_data(input + 8);
-    input += 32;
-  }
-}
-
-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
-                                    __m128i *out1) {
-  array_transpose_8x8(in0, out0);
-  array_transpose_8x8(&in0[8], out1);
-  array_transpose_8x8(in1, &out0[8]);
-  array_transpose_8x8(&in1[8], &out1[8]);
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
-                                      __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  {
-    const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-    const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-    const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-    u0 = _mm_mulhrs_epi16(in[0], stk4_0);
-    u2 = _mm_mulhrs_epi16(in[8], stk4_2);
-    u3 = _mm_mulhrs_epi16(in[8], stk4_3);
-    u1 = u0;
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  {
-    const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-    const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-    const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-    const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-    u4 = _mm_mulhrs_epi16(in[4], stk3_0);
-    u7 = _mm_mulhrs_epi16(in[4], stk3_1);
-    u5 = _mm_mulhrs_epi16(in[12], stk3_2);
-    u6 = _mm_mulhrs_epi16(in[12], stk3_3);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-  }
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
-                                      __m128i *out /*out[8]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;
-
-  {
-    const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-    const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-    const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-    const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-    const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-    const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-    const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-    const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-    u8 = _mm_mulhrs_epi16(in[2], stk2_0);
-    u15 = _mm_mulhrs_epi16(in[2], stk2_1);
-    u9 = _mm_mulhrs_epi16(in[14], stk2_2);
-    u14 = _mm_mulhrs_epi16(in[14], stk2_3);
-    u10 = _mm_mulhrs_epi16(in[10], stk2_4);
-    u13 = _mm_mulhrs_epi16(in[10], stk2_5);
-    u11 = _mm_mulhrs_epi16(in[6], stk2_6);
-    u12 = _mm_mulhrs_epi16(in[6], stk2_7);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
-                                    __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_8x32_135_quarter_1(in, temp);
-  idct32_8x32_135_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
-                                    __m128i *out /*out[32]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-    const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-    const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
-    const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
-    const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
-    const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
-    const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-    const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-    const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-    const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-    const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
-    const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
-    const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
-    const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
-    const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-    const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-    u16 = _mm_mulhrs_epi16(in[1], stk1_0);
-    u31 = _mm_mulhrs_epi16(in[1], stk1_1);
-    u17 = _mm_mulhrs_epi16(in[15], stk1_2);
-    u30 = _mm_mulhrs_epi16(in[15], stk1_3);
-
-    u18 = _mm_mulhrs_epi16(in[9], stk1_4);
-    u29 = _mm_mulhrs_epi16(in[9], stk1_5);
-    u19 = _mm_mulhrs_epi16(in[7], stk1_6);
-    u28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
-    u20 = _mm_mulhrs_epi16(in[5], stk1_8);
-    u27 = _mm_mulhrs_epi16(in[5], stk1_9);
-    u21 = _mm_mulhrs_epi16(in[11], stk1_10);
-    u26 = _mm_mulhrs_epi16(in[11], stk1_11);
-
-    u22 = _mm_mulhrs_epi16(in[13], stk1_12);
-    u25 = _mm_mulhrs_epi16(in[13], stk1_13);
-    u23 = _mm_mulhrs_epi16(in[3], stk1_14);
-    u24 = _mm_mulhrs_epi16(in[3], stk1_15);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  v20 = _mm_sub_epi16(u19, u20);
-  v21 = _mm_sub_epi16(u18, u21);
-  v22 = _mm_sub_epi16(u17, u22);
-  v23 = _mm_sub_epi16(u16, u23);
-
-  v24 = _mm_sub_epi16(u31, u24);
-  v25 = _mm_sub_epi16(u30, u25);
-  v26 = _mm_sub_epi16(u29, u26);
-  v27 = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
-    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
-    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
-    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
-  }
-}
-
-// 8x16 block, input __m128i in[16], output __m128i in[32]
-static void idct32_8x32_135(__m128i *in /*in[32]*/) {
-  __m128i out[32];
-  idct32_8x32_quarter_1_2(in, out);
-  idct32_8x32_quarter_3_4(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  int j = 0;
-  while (j < 32) {
-    in[j] = _mm_adds_epi16(in[j], final_rounding);
-    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
-
-    in[j] = _mm_srai_epi16(in[j], 6);
-    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
-
-    RECON_AND_STORE(dst, in[j]);
-    dst += stride;
-    RECON_AND_STORE(dst, in[j + 1]);
-    dst += stride;
-    j += 2;
-  }
-}
-
-static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
-                                   int stride) {
-  store_buffer_8x32(in0, dest, stride);
-  store_buffer_8x32(in1, dest + 8, stride);
-}
-
-static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
-  idct32_8x32_135(col0);
-  idct32_8x32_135(col1);
-}
-
-typedef enum { left_16, right_16 } ColsIndicator;
-
-static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
-                                     ColsIndicator cols) {
-  switch (cols) {
-    case left_16: {
-      int i;
-      array_transpose_16x16(in0, in1);
-      for (i = 0; i < 16; ++i) {
-        store[i] = in0[16 + i];
-        store[16 + i] = in1[16 + i];
-      }
-      break;
-    }
-    case right_16: {
-      array_transpose_16x16_2(store, &store[16], in0, in1);
-      break;
-    }
-    default: { assert(0); }
-  }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  // Each array represents an 8x32 block
-  __m128i col0[32], col1[32];
-  // This array represents a 16x16 block
-  __m128i temp[32];
-
-  // Load input data. Only need to load the top left 16x16 block.
-  load_buffer_16x16(input, col0, col1);
-
-  // columns
-  array_transpose_16x16(col0, col1);
-  idct32_135(col0, col1);
-
-  // rows
-  transpose_and_copy_16x16(col0, col1, temp, left_16);
-  idct32_135(col0, col1);
-  recon_and_store(col0, col1, dest, stride);
-
-  transpose_and_copy_16x16(col0, col1, temp, right_16);
-  idct32_135(col0, col1);
-  recon_and_store(col0, col1, dest + 16, stride);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m128i in[32]
-static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[16]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m128i in[32]
-static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m128i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[16]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  out[4] = _mm_sub_epi16(u19, u20);
-  out[5] = _mm_sub_epi16(u18, u21);
-  out[6] = _mm_sub_epi16(u17, u22);
-  out[7] = _mm_sub_epi16(u16, u23);
-
-  out[8] = _mm_sub_epi16(u31, u24);
-  out[9] = _mm_sub_epi16(u30, u25);
-  out[10] = _mm_sub_epi16(u29, u26);
-  out[11] = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_full_8x32_quarter_1(in, temp);
-  idct32_full_8x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_full_8x32(const __m128i *in /*in[32]*/,
-                             __m128i *out /*out[32]*/) {
-  __m128i temp[32];
-  idct32_full_8x32_quarter_1_2(in, temp);
-  idct32_full_8x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = load_input_data(input);
-    in[i + 8] = load_input_data(input + 8);
-    in[i + 16] = load_input_data(input + 16);
-    in[i + 24] = load_input_data(input + 24);
-    input += 32;
-  }
-}
-
-void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                  int stride) {
-  __m128i col[128], in[32];
-  int i, j;
-
-  // rows
-  for (i = 0; i < 4; ++i) {
-    load_buffer_8x32(input, in);
-    input += 32 << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    idct32_full_8x32(in, col + (i << 5));
-  }
-
-  // columns
-  for (i = 0; i < 4; ++i) {
-    j = i << 3;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    idct32_full_8x32(in, in);
-    store_buffer_8x32(in, dest, stride);
-    dest += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
index f0668e6f3..0bc841a7a 100644
--- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -85,15 +85,10 @@ SECTION .text
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_HIGHBITDEPTH
   mova            m0,        [inputq +  0]
   packssdw        m0,        [inputq + 16]
   mova            m1,        [inputq + 32]
   packssdw        m1,        [inputq + 48]
-%else
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-%endif
   psraw           m0,        2
   psraw           m1,        2
 
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
new file mode 100644
index 000000000..c3c88245a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+// Returns the sum of absolute differences (SAD) between two 4-pixel-wide
+// blocks of `height` rows. `a` and `b` point at the top-left pixel of each
+// block; `a_stride` and `b_stride` are the distances between their rows.
+// `width` must be 4 (asserted). Rows are consumed four at a time, so the
+// loop reads whole groups of four rows.
+unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int width, int height) {
+  assert(width == 4);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; row += 4) {
+    // Pack four 4-byte rows of `a` into a single 128-bit register.
+    const __m128i a01 =
+        _mm_unpacklo_epi32(xx_loadl_32(a), xx_loadl_32(a + a_stride));
+    const __m128i a23 = _mm_unpacklo_epi32(xx_loadl_32(a + 2 * a_stride),
+                                           xx_loadl_32(a + 3 * a_stride));
+    const __m128i src = _mm_unpacklo_epi64(a01, a23);
+
+    // Same packing for the four matching rows of `b`.
+    const __m128i b01 =
+        _mm_unpacklo_epi32(xx_loadl_32(b), xx_loadl_32(b + b_stride));
+    const __m128i b23 = _mm_unpacklo_epi32(xx_loadl_32(b + 2 * b_stride),
+                                           xx_loadl_32(b + 3 * b_stride));
+    const __m128i ref = _mm_unpacklo_epi64(b01, b23);
+
+    // _mm_sad_epu8 produces one partial sum per 64-bit half.
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(src, ref));
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+  // The two partial SADs sit at bits [0:31] and [64:95]; add them together.
+  const int low_half = _mm_cvtsi128_si32(acc);
+  const int high_half = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+
+  return low_half + high_half;
+}
+
+// Returns the sum of absolute differences (SAD) between two 8-pixel-wide
+// blocks of `height` rows. `a` and `b` point at the top-left pixel of each
+// block; `a_stride` and `b_stride` are the distances between their rows.
+// `width` must be 8 (asserted). Two 8-byte rows share one register, so
+// rows are consumed in pairs.
+unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int width, int height) {
+  assert(width == 8);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; row += 2) {
+    // Low half: the current row; high half: the row below it.
+    const __m128i src =
+        _mm_unpacklo_epi64(xx_loadl_64(a), xx_loadl_64(a + a_stride));
+    const __m128i ref =
+        _mm_unpacklo_epi64(xx_loadl_64(b), xx_loadl_64(b + b_stride));
+
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(src, ref));
+
+    a += 2 * a_stride;
+    b += 2 * b_stride;
+  }
+
+  // Combine the partial sums held in the low and high 64-bit halves.
+  const int low_half = _mm_cvtsi128_si32(acc);
+  const int high_half = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+
+  return low_half + high_half;
+}
+
+// Returns the SAD between two 16-pixel-wide blocks of `height` rows.
+// `width` must be 16 (asserted); one full row is handled per iteration.
+unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  assert(width == 16);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    const __m128i src = xx_loadu_128(a);
+    const __m128i ref = xx_loadu_128(b);
+    acc = _mm_add_epi32(acc, _mm_sad_epu8(src, ref));
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the low and high 64-bit halves.
+  const int low_half = _mm_cvtsi128_si32(acc);
+  const int high_half = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+
+  return low_half + high_half;
+}
+
+// Returns the SAD between two 32-pixel-wide blocks of `height` rows.
+// `width` must be 32 (asserted); each row is covered by two 16-byte loads.
+unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  assert(width == 32);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < 32; col += 16) {
+      const __m128i src = xx_loadu_128(a + col);
+      const __m128i ref = xx_loadu_128(b + col);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(src, ref));
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the low and high 64-bit halves.
+  const int low_half = _mm_cvtsi128_si32(acc);
+  const int high_half = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+
+  return low_half + high_half;
+}
+
+// Returns the SAD between two 64-pixel-wide blocks of `height` rows.
+// `width` must be 64 (asserted); each row is covered by four 16-byte loads.
+unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  assert(width == 64);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < 64; col += 16) {
+      const __m128i src = xx_loadu_128(a + col);
+      const __m128i ref = xx_loadu_128(b + col);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(src, ref));
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the low and high 64-bit halves.
+  const int low_half = _mm_cvtsi128_si32(acc);
+  const int high_half = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+
+  return low_half + high_half;
+}
+
+// Returns the SAD between two 128-pixel-wide blocks of `height` rows.
+// `width` must be 128 (asserted); each row is covered by eight 16-byte loads.
+unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int width, int height) {
+  assert(width == 128);
+  (void)width;
+
+  __m128i acc = _mm_setzero_si128();
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < 128; col += 16) {
+      const __m128i src = xx_loadu_128(a + col);
+      const __m128i ref = xx_loadu_128(b + col);
+      acc = _mm_add_epi32(acc, _mm_sad_epu8(src, ref));
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  // Combine the partial sums held in the low and high 64-bit halves.
+  const int low_half = _mm_cvtsi128_si32(acc);
+  const int high_half = _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
+
+  return low_half + high_half;
+}
+
+#define jnt_sadMxN_sse2(m, n) /* defines aom_jnt_sad<m>x<n>_avg_ssse3() */    \
+  unsigned int aom_jnt_sad##m##x##n##_avg_ssse3(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n]; /* m x n scratch block, row stride m */         \
+    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
+                          jcp_param); /* blend of ref and second_pred */      \
+    return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
+  }
+
+#define jnt_sadMxN_avx2(m, n) /* not expanded anywhere in this file */        \
+  unsigned int aom_jnt_sad##m##x##n##_avg_avx2(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n]; /* m x n scratch block, row stride m */         \
+    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
+                          jcp_param); /* blend of ref and second_pred */      \
+    return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
+  }
+
+/* clang-format off */
+jnt_sadMxN_sse2(128, 128)
+jnt_sadMxN_sse2(128, 64)
+jnt_sadMxN_sse2(64, 128)
+jnt_sadMxN_sse2(64, 64)
+jnt_sadMxN_sse2(64, 32)
+jnt_sadMxN_sse2(32, 64)
+jnt_sadMxN_sse2(32, 32)
+jnt_sadMxN_sse2(32, 16)
+jnt_sadMxN_sse2(16, 32)
+jnt_sadMxN_sse2(16, 16)
+jnt_sadMxN_sse2(16, 8)
+jnt_sadMxN_sse2(8, 16)
+jnt_sadMxN_sse2(8, 8)
+jnt_sadMxN_sse2(8, 4)
+jnt_sadMxN_sse2(4, 8)
+jnt_sadMxN_sse2(4, 4)
+jnt_sadMxN_sse2(4, 16)
+jnt_sadMxN_sse2(16, 4)
+jnt_sadMxN_sse2(8, 32)
+jnt_sadMxN_sse2(32, 8)
+jnt_sadMxN_sse2(16, 64)
+jnt_sadMxN_sse2(64, 16)
+    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 000000000..9801e285c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+// Horizontal 2-tap bilinear filter. Each output is the rounded, scaled blend
+// of a[x] (weight filter[0]) and a[x + 1] (weight filter[1]), written as
+// uint16_t. pixel_step is ignored: the two taps are always adjacent bytes.
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+  // in computation using _mm_maddubs_epi16.
+  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (unsigned int i = 0; i < output_height; ++i) {
+      for (unsigned int j = 0; j < output_width; j += 8) {
+        // Eight outputs need nine source bytes: a[0..7] in the low half and
+        // a[8] alone in the high half, so nothing past a[8] is ever read.
+        const __m128i source_low = xx_loadl_64(a);
+        const __m128i source_hi = _mm_cvtsi32_si128(a[8]);
+        const __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
+
+        // shuffle to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    for (unsigned int i = 0; i < output_height; ++i) {
+      // load source, only first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle, up to the first 8 are useful
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+// Vertical 2-tap bilinear filter over the uint16_t first-pass output. Each
+// 8-bit result blends a[x] (weight filter[0]) with a[x + pixel_step]
+// (weight filter[1]), rounded and shifted down by FILTER_BITS.
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t rounding = (1 << FILTER_BITS) >> 1;
+  const __m128i round_vec = _mm_set1_epi32(rounding);
+  const __m128i taps =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  // Byte shuffle that interleaves the two 4 x uint16_t halves.
+  const __m128i interleave =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  // Byte shuffle that moves the low byte of every 32-bit lane to the front.
+  const __m128i pick_low_bytes =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+
+  for (unsigned int row = 0; row < output_height; ++row) {
+    for (unsigned int col = 0; col < output_width; col += 4) {
+      // Low half: a[0..3]; high half: the four samples pixel_step further on.
+      const __m128i rows =
+          _mm_unpacklo_epi64(xx_loadl_64(a), xx_loadl_64(a + pixel_step));
+
+      // Interleave to { a[0], a[s], a[1], a[s+1], ... } (s = pixel_step) so
+      // that every 32-bit lane holds one pair of taps.
+      const __m128i pairs = _mm_shuffle_epi8(rows, interleave);
+
+      // a[i] * filter[0] + a[s + i] * filter[1], then round and shift.
+      __m128i out = _mm_madd_epi16(pairs, taps);
+      out = _mm_srai_epi32(_mm_add_epi32(out, round_vec), FILTER_BITS);
+
+      // Narrow to bytes and store four pixels.
+      out = _mm_shuffle_epi8(out, pick_low_bytes);
+      xx_storel_32(b, out);
+
+      a += 4;
+      b += 4;
+    }
+
+    // Step over the unused remainder of the source row.
+    a += src_pixels_per_line - output_width;
+  }
+}
+
+// For 16 byte pairs, computes (p0 * w_even + p1 * w_odd + r) >>
+// DIST_PRECISION_BITS, saturates to 8 bits and stores 16 bytes at `result`.
+static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  const __m128i lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(*p0, *p1), *w);
+  const __m128i hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(*p0, *p1), *w);
+
+  const __m128i out_lo =
+      _mm_srai_epi16(_mm_add_epi16(lo, *r), DIST_PRECISION_BITS);
+  const __m128i out_hi =
+      _mm_srai_epi16(_mm_add_epi16(hi, *r), DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(out_lo, out_hi));
+}
+
+// Writes the jnt-weighted average of `ref` (row stride ref_stride) and `pred`
+// to `comp_pred`: ref is weighted by fwd_offset, pred by bck_offset. `pred`
+// and `comp_pred` are contiguous width x height blocks (row stride = width).
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                 int width, int height, const uint8_t *ref,
+                                 int ref_stride,
+                                 const JNT_COMP_PARAMS *jcp_param) {
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r = _mm_set1_epi16(round);
+
+  if (width >= 16) {
+    // Read 16 pixels one row at a time
+    assert(!(width & 15));
+    for (int i = 0; i < height; ++i) {
+      for (int j = 0; j < width; j += 16) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+        comp_pred += 16;
+        pred += 16;
+        ref += 16;
+      }
+      ref += ref_stride - width;
+    }
+  } else if (width >= 8) {
+    // Read 8 pixels from two rows at a time
+    assert(!(width & 7));
+    assert(!(height & 1));
+    for (int i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 2 * ref_stride;
+    }
+  } else {
+    // Read 4 pixels from four rows at a time
+    assert(!(width & 3));
+    assert(!(height & 3));
+    for (int i = 0; i < height; i += 4) {
+      const uint8_t *row0 = ref + 0 * ref_stride;
+      const uint8_t *row1 = ref + 1 * ref_stride;
+      const uint8_t *row2 = ref + 2 * ref_stride;
+      const uint8_t *row3 = ref + 3 * ref_stride;
+
+      __m128i p0 =
+          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+                        row3[0], row3[1], row3[2], row3[3]);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 4 * ref_stride;
+    }
+  }
+}
+
+// Calls aom_upsampled_pred() to fill comp_pred from `ref`, then blends that
+// result in place with `pred` using the fwd/bck weights from jcp_param.
+void aom_jnt_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  const int num_vectors = width * height >> 4;
+
+  const uint8_t fwd = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t bck = (uint8_t)jcp_param->bck_offset;
+  const __m128i weights = _mm_set_epi8(bck, fwd, bck, fwd, bck, fwd, bck, fwd,
+                                       bck, fwd, bck, fwd, bck, fwd, bck, fwd);
+  const uint16_t rounding = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i round_vec = _mm_set1_epi16(rounding);
+
+  for (int k = 0; k < num_vectors; ++k) {
+    __m128i upsampled = xx_loadu_128(comp_pred);
+    __m128i second = xx_loadu_128(pred);
+
+    compute_jnt_comp_avg(&upsampled, &second, &weights, &round_vec, comp_pred);
+
+    comp_pred += 16;
+    pred += 16;
+  }
+}
+
+#define JNT_SUBPIX_AVG_VAR(W, H)                                         \
+  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3(              \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
+      const uint8_t *b, int b_stride, uint32_t *sse,                     \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {    \
+    uint16_t fdata3[(H + 1) * W]; /* first-pass output, H + 1 rows */    \
+    uint8_t temp2[H * W];         /* second-pass output, H rows */       \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); /* blended block */      \
+    /* Filter horizontally (H + 1 rows), then vertically (H rows). */    \
+    aom_var_filter_block2d_bil_first_pass_ssse3(                         \
+        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_ssse3(                        \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);        \
+    /* Blend the filtered block with second_pred into temp3. */          \
+    aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,      \
+                                jcp_param);                              \
+    /* Score the blend against b with aom_variance<W>x<H>(). */          \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);            \
+  }
+
+JNT_SUBPIX_AVG_VAR(128, 128)
+JNT_SUBPIX_AVG_VAR(128, 64)
+JNT_SUBPIX_AVG_VAR(64, 128)
+JNT_SUBPIX_AVG_VAR(64, 64)
+JNT_SUBPIX_AVG_VAR(64, 32)
+JNT_SUBPIX_AVG_VAR(32, 64)
+JNT_SUBPIX_AVG_VAR(32, 32)
+JNT_SUBPIX_AVG_VAR(32, 16)
+JNT_SUBPIX_AVG_VAR(16, 32)
+JNT_SUBPIX_AVG_VAR(16, 16)
+JNT_SUBPIX_AVG_VAR(16, 8)
+JNT_SUBPIX_AVG_VAR(8, 16)
+JNT_SUBPIX_AVG_VAR(8, 8)
+JNT_SUBPIX_AVG_VAR(8, 4)
+JNT_SUBPIX_AVG_VAR(4, 8)
+JNT_SUBPIX_AVG_VAR(4, 4)
+JNT_SUBPIX_AVG_VAR(4, 16)
+JNT_SUBPIX_AVG_VAR(16, 4)
+JNT_SUBPIX_AVG_VAR(8, 32)
+JNT_SUBPIX_AVG_VAR(32, 8)
+JNT_SUBPIX_AVG_VAR(16, 64)
+JNT_SUBPIX_AVG_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
index bf8150e2a..18862dd3e 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -11,13 +11,14 @@
 
 #include <immintrin.h> /* AVX2 */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
-void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
-                                    const unsigned char *_blimit,
-                                    const unsigned char *_limit,
-                                    const unsigned char *_thresh) {
+void aom_lpf_horizontal_16_avx2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
@@ -368,7 +369,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
+void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                      const unsigned char *_blimit,
                                      const unsigned char *_limit,
                                      const unsigned char *_thresh) {
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index 8343dbbed..f1eac233b 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -11,7 +11,9 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
 
@@ -19,1047 +21,1016 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
-#if CONFIG_PARALLEL_DEBLOCKING
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK4                                                      \
-  do {                                                                        \
-    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
-    __m128i flat = abs_diff(q1p1, q0p0);                                      \
-    /* abs(p1 - q1), abs(p0 - q0) */                                          \
-    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
-    __m128i abs_p0q0, abs_p1q1;                                               \
-                                                                              \
-    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
-    hev =                                                                     \
-        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
-    hev = _mm_packs_epi16(hev, hev);                                          \
-                                                                              \
-    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
-    /*                                  p1, p0, q0, q1); */                   \
-    abs_p0q0 =                                                                \
-        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
-    abs_p1q1 =                                                                \
-        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
-    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
-    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
-    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
-    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
-    mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
-    mask = _mm_cmpeq_epi8(mask, zero);                                        \
-    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
-  } while (0)
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK                                                       \
-  do {                                                                        \
-    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
-    __m128i flat = abs_diff(q1p1, q0p0);                                      \
-    /* abs(p1 - q1), abs(p0 - q0) */                                          \
-    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
-    __m128i abs_p0q0, abs_p1q1, work;                                         \
-                                                                              \
-    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
-    hev =                                                                     \
-        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
-    hev = _mm_packs_epi16(hev, hev);                                          \
-                                                                              \
-    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
-    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
-    abs_p0q0 =                                                                \
-        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
-    abs_p1q1 =                                                                \
-        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
-    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
-    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
-    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
-    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
-    /* abs(p3 - p2), abs(p2 - p1) */                                          \
-    work = abs_diff(p3p2, p2p1);                                              \
-    flat = _mm_max_epu8(work, flat);                                          \
-    /* abs(q3 - q2), abs(q2 - q1) */                                          \
-    work = abs_diff(q3q2, q2q1);                                              \
-    flat = _mm_max_epu8(work, flat);                                          \
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
-    mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
-    mask = _mm_cmpeq_epi8(mask, zero);                                        \
-    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
-  } while (0)
-
-#define FILTER4                                                             \
-  do {                                                                      \
-    const __m128i t3t4 =                                                    \
-        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
-    const __m128i t80 = _mm_set1_epi8(0x80);                                \
-    __m128i filter, filter2filter1, work;                                   \
-                                                                            \
-    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
-    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
-                                                                            \
-    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
-    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
-    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
-    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
-    filter = _mm_subs_epi8(filter, work);                                   \
-    filter = _mm_subs_epi8(filter, work);                                   \
-    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
-    filter = _mm_and_si128(filter, mask); /* & mask */                      \
-    filter = _mm_unpacklo_epi64(filter, filter);                            \
-                                                                            \
-    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
-    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
-    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
-    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
-    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
-    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
-    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
-    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
-                                                                            \
-    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
-    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
-    filter = _mm_unpacklo_epi8(filter, filter);                             \
-    filter = _mm_srai_epi16(filter, 9); /* round */                         \
-    filter = _mm_packs_epi16(filter, filter);                               \
-    filter = _mm_andnot_si128(hev, filter);                                 \
-                                                                            \
-    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
-    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
-                                                                            \
-    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
-    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
-    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
-    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
-    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
-    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
-  } while (0)
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                             __m128i *x2, __m128i *x3,
+                                             __m128i *d0, __m128i *d1,
+                                             __m128i *d2, __m128i *d3) {
+  // input: four rows; only the low 8 bytes of each register are used
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output: d0..d3, columns 0-3 transposed into the low 4 bytes of each
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  *d0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+  *d1 = _mm_srli_si128(*d0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(*d0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(*d0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *d0, __m128i *d1,
+                                         __m128i *d2, __m128i *d3, __m128i *d4,
+                                         __m128i *d5, __m128i *d6,
+                                         __m128i *d7) {
+  // input: four rows; only the low 8 bytes of each register are used
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output: d0..d7, one transposed column in the low 4 bytes of each
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  ww0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(ww0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(ww0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(ww0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d5 = _mm_srli_si128(ww1,
+                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d6 = _mm_srli_si128(ww1,
+                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d7 = _mm_srli_si128(ww1,
+                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *x4, __m128i *x5,
+                                         __m128i *x6, __m128i *x7, __m128i *d0,
+                                         __m128i *d1, __m128i *d2,
+                                         __m128i *d3) {
+  // Transposes byte columns 0..3 of rows x0..x7 into d0..d3. Input rows:
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output (only the low 8 bytes of each dN are meaningful):
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+  // Input columns 4..7 are dropped after the first interleave.
+  __m128i w0, w1, w2, w3, w4, w5;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d1 = _mm_srli_si128(*d0, 8);  // 01 11 21 31 41 51 61 71 (high half of d0)
+  *d2 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d3 = _mm_srli_si128(*d2, 8);  // 03 13 23 33 43 53 63 73 (high half of d2)
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                     __m128i *x3, __m128i *x4, __m128i *x5,
+                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
+                                     __m128i *d2d3, __m128i *d4d5,
+                                     __m128i *d6d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7;  // 8x8 byte transpose temporaries
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  // Each output holds two transposed columns: dN in the low 8 bytes, dN+1 high.
+  *d0d1 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d2d3 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  // The same two steps on the high 16-bit lanes give columns 4..7.
+  w6 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  w7 = _mm_unpackhi_epi16(
+      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+  *d4d5 = _mm_unpacklo_epi32(
+      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  *d6d7 = _mm_unpackhi_epi32(
+      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+  // Transposes the low 8 bytes of 16 rows: dN = byte column N of x0..x15.
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+  // Same byte interleave for rows 8..15.
+  w8 = _mm_unpacklo_epi8(*x8, *x9);
+  w9 = _mm_unpacklo_epi8(*x10, *x11);
+  w10 = _mm_unpacklo_epi8(*x12, *x13);
+  w11 = _mm_unpacklo_epi8(*x14, *x15);
+  // Low 16-bit lanes: columns 0..3 of each group of four rows.
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result: columns 0..3 (rows 0..7 low, rows 8..15 high)
+  *d0 = _mm_unpacklo_epi64(w6, w14);
+  *d1 = _mm_unpackhi_epi64(w6, w14);
+  *d2 = _mm_unpacklo_epi64(w7, w15);
+  *d3 = _mm_unpackhi_epi64(w7, w15);
+  // High 16-bit lanes: repeat the same steps for columns 4..7.
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result: columns 4..7
+  *d4 = _mm_unpacklo_epi64(w6, w14);
+  *d5 = _mm_unpackhi_epi64(w6, w14);
+  *d6 = _mm_unpacklo_epi64(w7, w15);
+  *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+    __m128i *d12d13, __m128i *d14d15) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+  // Transposes 8 rows of 16 bytes into 16 columns of 8 bytes, two per output.
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+  // w8..w11 repeat the interleave on bytes 8..15 (columns 8..15).
+  w8 = _mm_unpackhi_epi8(*x0, *x1);
+  w9 = _mm_unpackhi_epi8(*x2, *x3);
+  w10 = _mm_unpackhi_epi8(*x4, *x5);
+  w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+  // Despite the dNdM names, each output = column k (low 8 bytes) followed by
+  // column k + 8 (high 8 bytes). Store first 4-line result: k = 0..3.
+  *d0d1 = _mm_unpacklo_epi64(w6, w14);
+  *d2d3 = _mm_unpackhi_epi64(w6, w14);
+  *d4d5 = _mm_unpacklo_epi64(w7, w15);
+  *d6d7 = _mm_unpackhi_epi64(w7, w15);
+  // High 16-bit lanes: same steps for columns 4..7 and 12..15.
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result: k = 4..7.
+  *d8d9 = _mm_unpacklo_epi64(w6, w14);
+  *d10d11 = _mm_unpackhi_epi64(w6, w14);
+  *d12d13 = _mm_unpacklo_epi64(w7, w15);
+  *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                          __m128i *hev, __m128i *mask,
+                                          __m128i *qs1qs0, __m128i *ps1ps0) {
+  const __m128i t3t4 =
+      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
+  const __m128i t80 = _mm_set1_epi8(0x80);
+  __m128i filter, filter2filter1, work;
+  __m128i ps1ps0_work, qs1qs0_work;
+  __m128i hev1;
+  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+  /* Register layout: low 8 bytes = p0/q0 row, high 8 bytes = p1/q1 row. */
+  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
+  filter = _mm_and_si128(filter, *mask); /* & mask */
+  filter = _mm_unpacklo_epi64(filter, filter);
+  /* t3t4 adds 4 in the low 8 bytes (filter1) and 3 in the high (filter2). */
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+  filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
+  /* (Bytes were doubled into 16-bit lanes, hence 11 = 8 + 3 above.) */
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+  filter = _mm_unpacklo_epi8(filter, filter);
+  filter = _mm_srai_epi16(filter, 9); /* round */
+  filter = _mm_packs_epi16(filter, filter);
+  filter = _mm_andnot_si128(*hev, filter);
+  /* hev1 is just a temporary here: {high 8: filter, low 8: filter2}. */
+  hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
+
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+  __m128i q1p1, q0p0, p1p0, q1q0;
+  __m128i abs_p0q0, abs_p1q1;
+  __m128i mask, hev;
+  const __m128i zero = _mm_setzero_si128();
+  /* Packed names read high:low, e.g. q1p1 = {high 8: q1, low 8: p1}. */
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+  /* abs(q1 - q0), abs(p1 - p0) */
+  __m128i flat = abs_diff(q1p1, q0p0);
+  /* abs(p1 - q1), abs(p0 - q0) */
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+  hev = _mm_unpacklo_epi8(flat, zero);
+  /* 16-bit compare: *thresh must hold the threshold widened to 16 bits. */
+  hev = _mm_cmpgt_epi16(hev, *thresh);
+  hev = _mm_packs_epi16(hev, hev);
+
+  /* const int8_t mask = filter_mask2(*limit, *blimit, */
+  /*                                  p1, p0, q0, q1); */
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+  mask = _mm_unpacklo_epi64(mask, flat); /* high 8: max(|p1-p0|, |q1-q0|) */
+  mask = _mm_subs_epu8(mask, *limit); /* *limit = {high 8: limit, low 8: blimit} */
+  mask = _mm_cmpeq_epi8(mask, zero);
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+  /* Only the low 8 bytes of mask are valid; filter4_sse2 reads just those. */
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
 
 void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                                const uint8_t *_blimit, const uint8_t *_limit,
                                const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
+  const __m128i zero = _mm_setzero_si128();
+  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                                     _mm_loadl_epi64((const __m128i *)_limit));
+  __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3p2, p2p1, q3q2, q2q1;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i mask, hev;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
-#if !CONFIG_PARALLEL_DEBLOCKING
-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
-  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-#if !CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK;
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK4;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  FILTER4;
-
-#if CONFIG_PARALLEL_DEBLOCKING
-  *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 8);
-  *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0);
-
-  *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 8);
-  *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0);
-#else
-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
-#endif
+  // Rows p1..q1 are each read as one int; results go out via xx_storel_32.
+  __m128i qs1qs0, ps1ps0;
+  __m128i p1, p0, q0, q1;
+
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
+  // ps1ps0/qs1qs0: low 8 bytes = p0/q0 row, high 8 bytes = p1/q1 row.
+  xx_storel_32(s - 1 * p, ps1ps0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
+  xx_storel_32(s + 0 * p, qs1qs0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
 }
 
 void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                              const uint8_t *_blimit, const uint8_t *_limit,
                              const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
+  __m128i p1p0, q1q0;
+  __m128i p1, p0, q0, q1;
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                                     _mm_loadl_epi64((const __m128i *)_limit));
+  __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  // Load 8 bytes from each of 4 rows, starting 2 bytes to the left of s.
   __m128i x0, x1, x2, x3;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3p2, p2p1, q3q2, q2q1;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i mask, hev;
+  __m128i d0, d1, d2, d3;
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 
-  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
-
-  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
-
-  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
-
-  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
-
-  // Transpose 8x8
-  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
-  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
-  x0 = _mm_unpacklo_epi16(x2, x3);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
-  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
-
-  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
-  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
-  x2 = _mm_unpackhi_epi16(x2, x3);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
-
-  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
-  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
-  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-#if !CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK;
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK4;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  FILTER4;
+  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
+  // p1, p0, q0, q1: byte columns s-2 .. s+1, one each, in their low 4 bytes.
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
 
   // Transpose 8x4 to 4x8
-  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
-  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
-  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
-  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
-  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
-  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
-  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
-  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
-#endif
-  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
-
-  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-#endif
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+  // Transpose the filtered p1, p0, q0, q1 columns back into 4-byte rows.
+  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
 }
 
-static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num,
-                                       uint8_t *s) {
-#if CONFIG_PARALLEL_DEBLOCKING
-  *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x);
-  const __m128i hi = _mm_srli_si128(*x, 8);
-  *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi);
-#else
-  _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x);
-  _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x));
-#endif
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+  xx_storel_32(s - (num + 1) * p, x);  // low half of x -> row -(num + 1)
+  xx_storel_32(s + num * p, _mm_srli_si128(x, 8));  // high half -> row +num
 }
 
-void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit,
-                                    const unsigned char *_limit,
-                                    const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
   __m128i mask, hev, flat, flat2;
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+  __m128i qs0ps0, qs1ps1;
+  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
   __m128i abs_p1p0;
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
-  q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+  p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
+  q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
 
   {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+    __m128i fe, ff, work;
+    abs_p1p0 = abs_diff(*q1p1, *q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
     fe = _mm_set1_epi8(0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = abs_diff(q0p0, p0q0);
-    abs_p1q1 = abs_diff(q1p1, p1q1);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, *thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
 
-    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
   }
 
-  // lp filter
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+  qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
+  qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+  // loopfilter done
+
+  __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+  {
+    __m128i work;
+    flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+
+    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+    work = abs_diff(*q6p6, *q0p0);
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
   {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi16(0x1);
-    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
-    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
-    __m128i qs0 = _mm_xor_si128(p0q0, t80);
-    __m128i qs1 = _mm_xor_si128(p1q1, t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, qs0ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 0xB);
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 0xB);
-
-    // Filter1 >> 3
-    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
-    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
-    // filt >> 1
-    filt = _mm_adds_epi16(filter1, t1);
-    filt = _mm_srai_epi16(filt, 1);
-    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
-                            filt);
-    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
-    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
-    // loopfilter done
-
-    {
-      __m128i work;
-      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-      flat = _mm_max_epu8(abs_p1p0, flat);
-      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
-
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
-      q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
-
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
-      q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
-      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
-
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
-      q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
-      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i four = _mm_set1_epi16(4);
-      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-      __m128i pixelFilter_p, pixelFilter_q;
-      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
-      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
-      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
-      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
-      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
-      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
-      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
-      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
-      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
-      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
-      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
-      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
-      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
-      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
-      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
-      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
-      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
-      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
-                                    _mm_add_epi16(p4_16, p3_16));
-      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
-                                    _mm_add_epi16(q4_16, q3_16));
-
-      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-      pixelFilter_p =
-          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-      pixetFilter_p2p1p0 = _mm_add_epi16(
-          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
-      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(p7_16, p7_16);
-      sum_q7 = _mm_add_epi16(q7_16, q7_16);
-      sum_p3 = _mm_add_epi16(p3_16, p3_16);
-      sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
-      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
-      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    flat = _mm_shuffle_epi32(flat, 68);
-    flat2 = _mm_shuffle_epi32(flat2, 68);
-
-    q2p2 = _mm_andnot_si128(flat, q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    q6p6 = _mm_andnot_si128(flat2, q6p6);
-    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
-    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    store_buffer_horz_8(&q6p6, p, 6, s);
-
-    q5p5 = _mm_andnot_si128(flat2, q5p5);
-    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    store_buffer_horz_8(&q5p5, p, 5, s);
-
-    q4p4 = _mm_andnot_si128(flat2, q4p4);
-    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    store_buffer_horz_8(&q4p4, p, 4, s);
-
-    q3p3 = _mm_andnot_si128(flat2, q3p3);
-    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    store_buffer_horz_8(&q3p3, p, 3, s);
-
-    q2p2 = _mm_andnot_si128(flat2, q2p2);
-    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    store_buffer_horz_8(&q2p2, p, 2, s);
-
-    q1p1 = _mm_andnot_si128(flat2, q1p1);
-    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    store_buffer_horz_8(&q1p1, p, 1, s);
-
-    q0p0 = _mm_andnot_si128(flat2, q0p0);
-    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    store_buffer_horz_8(&q0p0, p, 0, s);
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+    __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+    __m128i pixelFilter_p, pixelFilter_q;
+    __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+    __m128i sum_p6, sum_q6;
+    __m128i sum_p3, sum_q3, res_p, res_q;
+
+    p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
+    p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+    p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+    p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+    p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+    q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+    q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+    q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+    q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+    q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+    q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+    q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+    pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+    pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+    pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+    pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+    pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+    pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+    pixelFilter_p =
+        _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+    pixetFilter_p2p1p0 = _mm_add_epi16(
+        four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixelFilter_p,
+                      _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+                                    _mm_add_epi16(p1_16, q0_16))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixelFilter_p,
+                      _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+                                    _mm_add_epi16(p0_16, q1_16))),
+        4);
+    flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+    flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(p6_16, p6_16);
+    sum_q6 = _mm_add_epi16(q6_16, q6_16);
+    sum_p3 = _mm_add_epi16(p3_16, p3_16);
+    sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+        4);
+    flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+    flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+        4);
+    flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+    flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+        4);
+    flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+        4);
+    flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+        4);
+    flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
   }
-}
+  // wide flat
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-static INLINE __m128i filter_add2_sub2(const __m128i *const total,
-                                       const __m128i *const a1,
-                                       const __m128i *const a2,
-                                       const __m128i *const s1,
-                                       const __m128i *const s2) {
-  __m128i x = _mm_add_epi16(*a1, *total);
-  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
-  return x;
-}
+  flat = _mm_shuffle_epi32(flat, 68);
+  flat2 = _mm_shuffle_epi32(flat2, 68);
+
+  *q2p2 = _mm_andnot_si128(flat, *q2p2);
+  flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+  *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+  qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+  flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+  *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+  qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+  flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+  *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+  *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+  flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+  *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+  *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+  flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+  *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+  *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+  flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+  *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+  *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+  flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+  *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
 
-static INLINE __m128i filter8_mask(const __m128i *const flat,
-                                   const __m128i *const other_filt,
-                                   const __m128i *const f8_lo,
-                                   const __m128i *const f8_hi) {
-  const __m128i f8 =
-      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
-  const __m128i result = _mm_and_si128(*flat, f8);
-  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+  *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+  flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+  *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+  *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+  flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+  *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
 }
 
-static INLINE __m128i filter16_mask(const __m128i *const flat,
-                                    const __m128i *const other_filt,
-                                    const __m128i *const f_lo,
-                                    const __m128i *const f_hi) {
-  const __m128i f =
-      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
-  const __m128i result = _mm_and_si128(*flat, f);
-  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,  // p: row offset
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;  // one per row pair
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);  // aligned load
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  // qNpN: 32 bits from row s - (N + 1) * p (low half), s + N * p (high half).
+  q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
+
+  q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
+
+  q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
+
+  q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
+  // Filter in registers; the helper receives pointers to every operand.
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+  // Pass rows 0..5 to store_buffer_horz_8; q6p6 is loaded but not stored here.
+  store_buffer_horz_8(q0p0, p, 0, s);
+  store_buffer_horz_8(q1p1, p, 1, s);
+  store_buffer_horz_8(q2p2, p, 2, s);
+  store_buffer_horz_8(q3p3, p, 3, s);
+  store_buffer_horz_8(q4p4, p, 4, s);
+  store_buffer_horz_8(q5p5, p, 5, s);
 }
 
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, hev, flat;
+  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+  __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+  __m128i ps1ps0, qs1qs0;
 
-static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
-                                        int p, int offset, uint8_t *s) {
-  int i;
-  if (pixel_num == FOUR_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]);
-    }
-  }
-  if (pixel_num == EIGHT_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]);
-    }
-  }
-  if (pixel_num == SIXTEEN_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]);
-    }
-  }
-}
+  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
 
-static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
-                                             unsigned char *s, int p,
-                                             const unsigned char *_blimit,
-                                             const unsigned char *_limit,
-                                             const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-  __m128i mask, hev, flat, flat2;
-  __m128i p7, p6, p5;
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-  __m128i q5, q6, q7;
-
-  __m128i op2, op1, op0, oq0, oq1, oq2;
-
-  __m128i max_abs_p1p0q1q0;
-
-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+  *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
   {
-    const __m128i abs_p1p0 = abs_diff(p1, p0);
-    const __m128i abs_q1q0 = abs_diff(q1, q0);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-    __m128i abs_p0q0 = abs_diff(p0, q0);
-    __m128i abs_p1q1 = abs_diff(p1, q1);
-    __m128i work;
-    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    // filter_mask and hev_mask
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+    abs_p0q0 = abs_diff(*p1p0, *q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+    // SSE has no unsigned element comparison, so the idea is to detect
+    // whether at least one input X satisfies X > limit, which means the
+    // corresponding mask bit is set.
+    // To achieve that we find the global max of all abs(x - y) inputs and of
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If it is > limit the mask is set,
+    // otherwise it is not.
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, *thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+
+    work = abs_diff(q2p2, q1p1);
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-  }
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
 
-  {
-    __m128i work;
-    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
-    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
-    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    // flat_mask
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
-    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi64(flat, flat);
   }
 
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // filter4
+  // 5 tap filter
   {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
-
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    op1 = _mm_xor_si128(p1, t80);
-    op0 = _mm_xor_si128(p0, t80);
-    oq0 = _mm_xor_si128(q0, t80);
-    oq1 = _mm_xor_si128(q1, t80);
-
-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
-
-    work_a = _mm_subs_epi8(oq0, op0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-    filt = _mm_andnot_si128(hev, filt);
-    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
-    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
-    // loopfilter done
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // filter8
-    {
-      const __m128i four = _mm_set1_epi16(4);
-      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
-      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
-      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
-      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
-      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
-      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
-      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
-      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-
-      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
-      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
-      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
-      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
-      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
-      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
-      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
-      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
-      __m128i f8_lo, f8_hi;
-
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
-                            _mm_add_epi16(p3_lo, p2_lo));
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
-                            _mm_add_epi16(p2_lo, p1_lo));
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
-
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
-                            _mm_add_epi16(p3_hi, p2_hi));
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
-                            _mm_add_epi16(p2_hi, p1_hi));
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
-
-      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
-      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
-      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
-      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
-      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
-      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
-      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
-      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
-      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
-      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
-      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
-      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
-      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
-      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
-      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
-      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
-      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
-      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
-      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
-      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
-
-      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
-      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
-      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
-      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
-      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
-      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
-      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
-      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
-      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
-      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
-      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
-      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
-      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
-      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
-      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
-      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
-
-      __m128i f_lo;
-      __m128i f_hi;
-
-      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
-      f_lo =
-          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
-      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
-                           _mm_add_epi16(p2_lo, p1_lo));
-      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
-      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
-
-      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
-      f_hi =
-          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
-      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
-                           _mm_add_epi16(p2_hi, p1_hi));
-      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
-      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
-
-      __m128i x[14];
-      x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
-      x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
-      x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
-      x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
-      x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
-      x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
-      x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
-      x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
-      x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
-      x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
-      x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
-      x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
-      x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
-      x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-
-      store_buffer_horz_16(pixel_num, x, p, 6, s);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    const __m128i four = _mm_set1_epi16(4);
+
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+                            _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+                                 3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
+    workp_a = _mm_add_epi16(workp_a,
+                            workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+    workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+                            p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
+    workp_b = _mm_add_epi16(q1_16, q2_16);
+    workp_a = _mm_add_epi16(
+        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
+    workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+                            p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
+    workp_b = _mm_add_epi16(q2_16, q2_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+                                 3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
   }
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  *q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  *p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
 }
 
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+}
+
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+  // Each vector above interleaves the low 32-bit lanes of its 0 and 1 inputs.
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+  // Load 8 bytes from each of the six rows s - 3 * p .. s + 2 * p.
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  // q1q0 / p1p0 are passed by address; the stores below read them back.
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+  // Low 8 bytes go to rows -1 and 0; bytes 8..15 go to rows -2 and +1.
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   __m128i mask, hev, flat;
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+      flat_p1p0, flat_q0q1;
+  __m128i q2p2, q1p1, q0p0;
+  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2;
 
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+  q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
   {
     // filter_mask and hev_mask
+
+    // Since SSE has no unsigned element comparison, the idea is to find at
+    // least one case where X > limit, which means the corresponding mask
+    // bit is set.
+    // To achieve that we find the global max of all inputs of abs(x - y) or
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If it is > limit the mask is
+    // set, otherwise it is not.
+
     const __m128i one = _mm_set1_epi8(1);
     const __m128i fe = _mm_set1_epi8(0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
     abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
 
-    abs_p0q0 = abs_diff(q0p0, p0q0);
-    abs_p1q1 = abs_diff(q1p1, p1q1);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
+
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, *thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -1067,424 +1038,215 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
     // mask |= (abs(q1 - q0) > limit) * -1;
 
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
 
     // flat_mask4
 
     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
     flat = _mm_max_epu8(abs_p1p0, flat);
+
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi64(flat, flat);
   }
 
+  // filter8
   {
     const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    {
-      __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-    }
-  }
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 11);
-    filter1 = _mm_packs_epi16(filter1, filter1);
-
-    // Filter2 >> 3
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 11);
-    filter2 = _mm_packs_epi16(filter2, zero);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    filt = _mm_unpacklo_epi8(zero, filt);
-    filt = _mm_srai_epi16(filt, 9);
-    filt = _mm_packs_epi16(filt, zero);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-#if CONFIG_PARALLEL_DEBLOCKING
-    *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2);
-    *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1);
-    *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0);
-    *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0);
-    *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1);
-    *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2);
-#else
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-#endif
+
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+    p3_16 = _mm_unpacklo_epi8(*p3, zero);
+    q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+    // op2
+    workp_a =
+        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
+
+    // op1
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
   }
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+  work_a = _mm_andnot_si128(flat, *q2);
+  q2_16 = _mm_and_si128(flat, oq2);
+  *q2_out = _mm_or_si128(work_a, q2_16);
+
+  work_a = _mm_andnot_si128(flat, *p2);
+  p2_16 = _mm_and_si128(flat, op2);
+  *p2_out = _mm_or_si128(work_a, p2_16);
 }
 
-void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING
-  lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
-#else
-  lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh);
-#endif
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh) {
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0, p2_out, q2_out;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  // One int per row, s - 4 * p .. s + 3 * p. NOTE(review): is s int-aligned?
+  p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+  q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
+  // The four outputs are passed by address and stored through s below.
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+  // p1p0/q1q0 feed rows -1/0 directly and rows -2/+1 after an 8-byte shift.
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+  xx_storel_32(s - 3 * p, p2_out);
+  xx_storel_32(s + 2 * p, q2_out);
 }
 
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0,
+                                     const unsigned char *_blimit1,
+                                     const unsigned char *_limit1,
+                                     const unsigned char *_thresh1) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
                          _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-  const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
                          _mm_load_si128((const __m128i *)_thresh1));
 
-  __m128i mask, hev, flat;
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-    __m128i work;
+  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
 
-    // filter_mask and hev_mask
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
 
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
+  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+
+  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+  _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
+  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+  _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
+  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+  _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
+}
 
-    // flat_mask4
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
-        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
-        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-  }
-  {
-    const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    int i = 0;
-
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-  }
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0, p2_out, q2_out;
+
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
 }
 
 void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1494,449 +1256,405 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit1,
                                     const unsigned char *_limit1,
                                     const unsigned char *_thresh1) {
+  __m128i p1, p0, q0, q1;
+  __m128i qs1qs0, ps1ps0;
+
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  const __m128i zero = _mm_setzero_si128();
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
                          _mm_load_si128((const __m128i *)_blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
                          _mm_load_si128((const __m128i *)_limit1));
-  const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-  const __m128i zero = _mm_set1_epi16(0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3, p2, q2, q3;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p1, p0, q0, q1;
-  __m128i mask, hev, flat;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-#if !CONFIG_PARALLEL_DEBLOCKING
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // filter_mask and hev_mask
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-#if !CONFIG_PARALLEL_DEBLOCKING
-    __m128i work;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-#if !CONFIG_PARALLEL_DEBLOCKING
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
 
-  // filter4
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-  }
-}
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
 
-static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
-                                 int in_p, unsigned char *out, int out_p) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
-
-  // 2-way interleave w/hoisting of unpacks
-  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
-  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
-  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1
-
-  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
-  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
-  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2
-
-  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
-  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
-  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3
-
-  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
-  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
-  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
-  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9
-
-  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
-  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
-  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
-  x5 = _mm_unpacklo_epi16(x2, x3);                // 10
-
-  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
-  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
-  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6
-
-  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
-  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
-  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
-  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11
-
-  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
-  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
-  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
-  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12
-
-  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
-  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
-  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
-  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
+  __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
 
-  // Store first 4-line result
-  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
-  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+  __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
 
-  x4 = _mm_unpackhi_epi16(x0, x1);
-  x5 = _mm_unpackhi_epi16(x2, x3);
-  x12 = _mm_unpackhi_epi16(x8, x9);
-  x13 = _mm_unpackhi_epi16(x10, x11);
+  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  x6 = _mm_unpacklo_epi32(x4, x5);
-  x7 = _mm_unpackhi_epi32(x4, x5);
-  x14 = _mm_unpacklo_epi32(x12, x13);
-  x15 = _mm_unpackhi_epi32(x12, x13);
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
-  // Store second 4-line result
-  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
-  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
 }
 
-#if CONFIG_PARALLEL_DEBLOCKING
-#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
-#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
-#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
-#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
-#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
-#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
-enum { ROTATE_DWORD_RIGHT = 0x39 };
-static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
-                                 const uint8_t *pSrc,
-                                 const ptrdiff_t srcStride) {
-  for (uint32_t idx = 0; idx < 2; idx += 1) {
-    __m128i r0, r1, r2, r3;
-    // load data
-    r0 = movq(pSrc);
-    r1 = movq(pSrc + srcStride);
-    r2 = movq(pSrc + srcStride * 2);
-    r3 = movq(pSrc + srcStride * 3);
-    // transpose
-    r0 = punpcklbw(r0, r1);
-    r2 = punpcklbw(r2, r3);
-    r1 = punpckhwd(r0, r2);
-    r0 = punpcklwd(r0, r2);
-    // store data
-    movd(pDst, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 2, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 3, r0);
-    movd(pDst + dstStride * 4, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 5, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 6, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 7, r1);
-    // advance the pointers
-    pDst += dstStride * 8;
-    pSrc += 8;
-  }
-}
-
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-static INLINE void transpose(unsigned char *src[], int in_p,
-                             unsigned char *dst[], int out_p,
-                             int num_8x8_to_transpose) {
-  int idx8x8 = 0;
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i p0, q0, q1, p1;
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    unsigned char *in = src[idx8x8];
-    unsigned char *out = dst[idx8x8];
-
-    x0 =
-        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    x1 =
-        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-    x0 = _mm_unpacklo_epi8(x0, x1);
-
-    x2 =
-        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    x3 =
-        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-    x1 = _mm_unpacklo_epi8(x2, x3);
-
-    x4 =
-        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    x5 =
-        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-    x2 = _mm_unpacklo_epi8(x4, x5);
-
-    x6 =
-        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    x7 =
-        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-    x3 = _mm_unpacklo_epi8(x6, x7);
-
-    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-    x4 = _mm_unpacklo_epi16(x0, x1);
-    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-    x5 = _mm_unpacklo_epi16(x2, x3);
-    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 0 * out_p),
-                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
-    _mm_storeh_pd((double *)(out + 1 * out_p),
-                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 2 * out_p),
-                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
-    _mm_storeh_pd((double *)(out + 3 * out_p),
-                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
-
-    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi16(x0, x1);
-    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi16(x2, x3);
-    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 4 * out_p),
-                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
-    _mm_storeh_pd((double *)(out + 5 * out_p),
-                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 6 * out_p),
-                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
-    _mm_storeh_pd((double *)(out + 7 * out_p),
-                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
-}
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i qs1qs0, ps1ps0;
 
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  unsigned char *src[2];
-  unsigned char *dst[2];
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
 
-  // Loop filtering
-  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  transpose(src, 16, dst, p, 2);
-#else  // CONFIG_PARALLEL_DEBLOCKING
-  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-}
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
 
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
-                             const unsigned char *blimit,
-                             const unsigned char *limit,
-                             const unsigned char *thresh) {
-  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
-  unsigned char *src[1];
-  unsigned char *dst[1];
+  __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
 
-  transpose(src, p, dst, 8, 1);
+  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  // Loop filtering
-  aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
+  x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+                        &q1);
 
-  // Transpose back
-  transpose(src, 8, dst, p, 1);
-}
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
-  unsigned char *src[2];
-  unsigned char *dst[2];
+  p1 = _mm_srli_si128(ps1ps0, 8);
+  q1 = _mm_srli_si128(qs1qs0, 8);
 
-  // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+                        &d5, &d6, &d7);
 
-  // Loop filtering
-  aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
+}
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x2, x1, x0, x3;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+
+  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
+}
 
-  // Transpose back
-  transpose(src, 16, dst, p, 2);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+
+  lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+                        &d6, &d7);
+
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
 }
 
-void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
-                              const unsigned char *blimit,
-                              const unsigned char *limit,
-                              const unsigned char *thresh) {
-  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
-  unsigned char *src[2];
-  unsigned char *dst[2];
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+  __m128i p2, p0, q0, q2;
+  __m128i x2, x1, x0, x3;
+  __m128i q1q0, p1p0;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+  // Loop filtering
+  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
+                      &q2, &blimit, &limit, &thresh);
 
-  src[0] = s - 8;
-  src[1] = s;
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 8;
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose 16x8
-  transpose(src, p, dst, 8, 2);
+  transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+                        &d2, &d3);
 
-  // Loop filtering
-  aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
 
-  src[0] = t_dst;
-  src[1] = t_dst + 8 * 8;
-  dst[0] = s - 8;
-  dst[1] = s;
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
 
-  // Transpose back
-  transpose(src, 8, dst, p, 2);
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;  // rows 0..7
+  __m128i d1, d3, d5, d7;  // upper halves of d0d1..d6d7
+  __m128i q1q0, p1p0;
+  __m128i p2, p1, q1, q2;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+  // Load 8 bytes from each of eight rows, starting 4 bytes before s.
+  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+  // Transposed outputs come back packed two per register (d0d1..d6d7).
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+  // Shift each register's upper 8 bytes down into a register of its own.
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+  // Filter; outputs are written through the q1q0, p1p0, p2 and q2 pointers.
+  lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
+                      &p1p0, &p2, &q2, &blimit, &limit, &thresh);
+  // p1/q1 are the upper 8 bytes of the packed p1p0/q1q0 results.
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+  // Transpose back into d0d1..d6d7 (d0d1 is both an input and an output).
+  transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
+                    &d4d5, &d6d7);
+  // Store 8 bytes per row; odd rows come from the upper half of each pair.
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
 }
 
-void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
-
-  // Transpose 16x16
-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+                              const unsigned char *_blimit,
+                              const unsigned char *_limit,
+                              const unsigned char *_thresh) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x6, x5, x4, x3, x2, x1, x0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+  __m128i q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i p0_out, p1_out, p2_out, p3_out;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);  // aligned loads
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+  // Left of the edge: 8 bytes from each of four rows, starting at s - 8.
+  x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6,
+                        &p7);
+  // Right of the edge: 8 bytes from the same four rows, starting at s.
+  x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6,
+                        &q7);
+  // Pack pairs: low 8 bytes from a p register, high 8 bytes from a q one.
+  q6p6 = _mm_unpacklo_epi64(p1, q6);
+  q5p5 = _mm_unpacklo_epi64(p2, q5);
+  q4p4 = _mm_unpacklo_epi64(p3, q4);
+  q3p3 = _mm_unpacklo_epi64(p4, q3);
+  q2p2 = _mm_unpacklo_epi64(p5, q2);
+  q1p1 = _mm_unpacklo_epi64(p6, q1);
+  q0p0 = _mm_unpacklo_epi64(p7, q0);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+  // Left-side output rows; p0 and p1 are passed as loaded, unfiltered.
+  transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+                        &p0_out, &p1_out, &p2_out, &p3_out);
+  // Bring the upper 8 bytes of each filtered pair down to the low half.
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+  // Right-side output rows; q7 is passed as loaded, unfiltered.
+  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2,
+                        &q3);
+  // Store 8 bytes per row on each side of the edge.
+  _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out);
+
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+  _mm_storel_epi64((__m128i *)(s + 3 * p), q3);
+}
 
-  // Loop filtering
-  aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+void aom_lpf_vertical_14_dual_sse2(
+    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x7, x6, x5, x4, x3, x2, x1, x0;
+  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+  __m128i q0, q1, q2, q3, q7;
+  __m128i p0p1, p2p3, p4p5, p6p7;
+  // Threshold pointers use aligned 16-byte loads; 32-bit lanes interleaved.
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
 
-  // Transpose back
-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+  // x7..x0 now hold rows 0..7, 16 bytes each, loaded unaligned from s - 8.
+  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+  // Pack pairs: low half of one transposed register, high half of another.
+  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+  q7 = _mm_srli_si128(d14d15, 8);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+  // Bring the upper 8 bytes of each filtered pair down to the low half.
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+  // d0d1 and q7 go into the inverse transpose without passing the filter.
+  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+  // Store eight 16-byte rows back at s - 8 (unaligned stores).
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
 }
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index 027c890dc..c6b6469b4 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -14,117 +14,202 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-
-static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
-                                    int out_p, int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    uint16_t *in = src[idx8x8];
-    uint16_t *out = dst[idx8x8];
-
-    p0 =
-        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    p1 =
-        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    p2 =
-        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    p3 =
-        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    p4 =
-        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    p5 =
-        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    p6 =
-        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    p7 =
-        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 00 10 01 11 02 12 03 13
-    x0 = _mm_unpacklo_epi16(p0, p1);
-    // 20 30 21 31 22 32 23 33
-    x1 = _mm_unpacklo_epi16(p2, p3);
-    // 40 50 41 51 42 52 43 53
-    x2 = _mm_unpacklo_epi16(p4, p5);
-    // 60 70 61 71 62 72 63 73
-    x3 = _mm_unpacklo_epi16(p6, p7);
-    // 00 10 20 30 01 11 21 31
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 40 50 60 70 41 51 61 71
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 00 10 20 30 40 50 60 70
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 01 11 21 31 41 51 61 71
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
-    // 00 10 20 30 40 50 60 70
-    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
-    // 01 11 21 31 41 51 61 71
-
-    // 02 12 22 32 03 13 23 33
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 42 52 62 72 43 53 63 73
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 02 12 22 32 42 52 62 72
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
-    // 02 12 22 32 42 52 62 72
-    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
-    // 03 13 23 33 43 53 63 73
-
-    // 04 14 05 15 06 16 07 17
-    x0 = _mm_unpackhi_epi16(p0, p1);
-    // 24 34 25 35 26 36 27 37
-    x1 = _mm_unpackhi_epi16(p2, p3);
-    // 44 54 45 55 46 56 47 57
-    x2 = _mm_unpackhi_epi16(p4, p5);
-    // 64 74 65 75 66 76 67 77
-    x3 = _mm_unpackhi_epi16(p6, p7);
-    // 04 14 24 34 05 15 25 35
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 44 54 64 74 45 55 65 75
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 04 14 24 34 44 54 64 74
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 05 15 25 35 45 55 65 75
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
-    // 04 14 24 34 44 54 64 74
-    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
-    // 05 15 25 35 45 55 65 75
-
-    // 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 06 16 26 36 46 56 66 76
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
-    // 06 16 26 36 46 56 66 76
-    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
-    // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
+#include "config/aom_config.h"
+
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+                                            __m128i *x2, __m128i *x3,
+                                            __m128i *x4, __m128i *x5,
+                                            __m128i *d0, __m128i *d1,
+                                            __m128i *d2, __m128i *d3,
+                                            __m128i *d4, __m128i *d5) {
+  __m128i w0, w1, w2, w3, w4, w5, ww0;
+  // 6x6 transpose of 16-bit lanes; input rows x0..x5 (lanes 6-7 don't-care):
+  // 00 01 02 03 04 05 xx xx
+  // 10 11 12 13 14 15 xx xx
+  // 20 21 22 23 24 25 xx xx
+  // 30 31 32 33 34 35 xx xx
+  // 40 41 42 43 44 45 xx xx
+  // 50 51 52 53 54 55 xx xx
+  // Each dK receives column K (0K 1K 2K 3K 4K 5K) in lanes 0-5.
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+  // w2 is byte-shifted so the wanted 4K 5K pair sits where each unpack reads.
+  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
+  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
+  *d1 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  *d2 = _mm_unpacklo_epi64(ww0,
+                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
+
+  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
+  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
+  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
+
+  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
+
+  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
+  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
+  *d5 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                                    __m128i *x2, __m128i *x3,
+                                                    __m128i *d0, __m128i *d1,
+                                                    __m128i *d2, __m128i *d3) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i w0, w1, ww0, ww1;
+  // Columns 0..3 of four 8-lane rows; upper four output lanes are zeroed.
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+
+  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
+  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
+  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
+  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+                                                     __m128i *x2, __m128i *x3,
+                                                     __m128i *d4, __m128i *d5,
+                                                     __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, ww2, ww3;
+  __m128i zero = _mm_setzero_si128();
+  // Columns 4..7 of four 8-lane rows; upper four output lanes are zeroed.
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+
+  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
+  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
+
+  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
+  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
+  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
+  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
 }
 
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
-                                        uint16_t *out, int out_p) {
-  uint16_t *src0[1];
-  uint16_t *src1[1];
-  uint16_t *dest0[1];
-  uint16_t *dest1[1];
-  src0[0] = in0;
-  src1[0] = in1;
-  dest0[0] = out;
-  dest1[0] = out + 8;
-  highbd_transpose(src0, in_p, dest0, out_p, 1);
-  highbd_transpose(src1, in_p, dest1, out_p, 1);
+// Input (x) and output (d) pointers must not alias: d0..d3 are written
+// before x0..x3 are read a second time to produce d4..d7.
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3,
+                                                __m128i *d4, __m128i *d5,
+                                                __m128i *d6, __m128i *d7) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // output
+  // 00 10 20 30 xx xx xx xx
+  // 01 11 21 31 xx xx xx xx
+  // 02 12 22 32 xx xx xx xx
+  // 03 13 23 33 xx xx xx xx
+  // 04 14 24 34 xx xx xx xx
+  // 05 15 25 35 xx xx xx xx
+  // 06 16 26 36 xx xx xx xx
+  // 07 17 27 37 xx xx xx xx
+  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
 }
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *x4, __m128i *x5,
+                                                __m128i *x6, __m128i *x7,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // Only the low four lanes of each row are read: outputs are columns 0..3.
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
+
+  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
+  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
+
+  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
+  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+                                                 __m128i *x2, __m128i *x3,
+                                                 __m128i *x4, __m128i *x5,
+                                                 __m128i *x6, __m128i *x7,
+                                                 __m128i *d4, __m128i *d5,
+                                                 __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
+  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
+  // Only the upper four lanes were read above: outputs are columns 4..7.
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
+
+  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
+  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
+
+  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
+  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
+}
+
+// Input (x) and output (d) pointers must not alias: d0..d3 are written
+// before x0..x7 are read a second time to produce d4..d7.
+static INLINE void highbd_transpose8x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// Each x and d pointer must address two consecutive registers: element 0 is
+// transposed first, then element 1. Inputs and outputs must not alias.
+static INLINE void highbd_transpose8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+                           d5, d6, d7);
+  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+}
+
 #endif  // _AOM_DSP_X86_LPF_COMMON_X86_H
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 2536f91d2..1f42eec2f 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -12,8 +12,9 @@
 #include <stdio.h>
 #include <tmmintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/blend.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
@@ -75,11 +76,9 @@ static INLINE unsigned int masked_sad4xh_ssse3(
                                  ref_stride, msk, msk_stride, n);             \
   }
 
-#if CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(128, 128)
 MASKSADMXN_SSSE3(128, 64)
 MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(64, 64)
 MASKSADMXN_SSSE3(64, 32)
 MASKSADMXN_SSSE3(32, 64)
@@ -93,18 +92,12 @@ MASKSAD8XN_SSSE3(8)
 MASKSAD8XN_SSSE3(4)
 MASKSAD4XN_SSSE3(8)
 MASKSAD4XN_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 MASKSAD4XN_SSSE3(16)
 MASKSADMXN_SSSE3(16, 4)
 MASKSAD8XN_SSSE3(32)
 MASKSADMXN_SSSE3(32, 8)
 MASKSADMXN_SSSE3(16, 64)
 MASKSADMXN_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(32, 128)
-MASKSADMXN_SSSE3(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                             int src_stride,
@@ -239,7 +232,6 @@ static INLINE unsigned int masked_sad4xh_ssse3(
   return (sad + 31) >> 6;
 }
 
-#if CONFIG_HIGHBITDEPTH
 // For width a multiple of 8
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
@@ -277,11 +269,9 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
                                         ref8, ref_stride, msk, msk_stride, n); \
   }
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(128, 128)
 HIGHBD_MASKSADMXN_SSSE3(128, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(64, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 64)
@@ -295,18 +285,12 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8)
 HIGHBD_MASKSADMXN_SSSE3(8, 4)
 HIGHBD_MASKSAD4XN_SSSE3(8)
 HIGHBD_MASKSAD4XN_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASKSAD4XN_SSSE3(16)
 HIGHBD_MASKSADMXN_SSSE3(16, 4)
 HIGHBD_MASKSADMXN_SSSE3(8, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 8)
 HIGHBD_MASKSADMXN_SSSE3(16, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(32, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
@@ -424,5 +408,3 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
   int sad = _mm_cvtsi128_si32(res);
   return (sad + 31) >> 6;
 }
-
-#endif
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
index 3ffe132be..d7dbefd7d 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -13,13 +13,15 @@
 #include <string.h>
 #include <tmmintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/blend.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
 
 // For width a multiple of 16
 static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
@@ -108,11 +110,9 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
   }
 
-#if CONFIG_EXT_PARTITION
 MASK_SUBPIX_VAR_SSSE3(128, 128)
 MASK_SUBPIX_VAR_SSSE3(128, 64)
 MASK_SUBPIX_VAR_SSSE3(64, 128)
-#endif
 MASK_SUBPIX_VAR_SSSE3(64, 64)
 MASK_SUBPIX_VAR_SSSE3(64, 32)
 MASK_SUBPIX_VAR_SSSE3(32, 64)
@@ -126,18 +126,12 @@ MASK_SUBPIX_VAR8XH_SSSE3(8)
 MASK_SUBPIX_VAR8XH_SSSE3(4)
 MASK_SUBPIX_VAR4XH_SSSE3(8)
 MASK_SUBPIX_VAR4XH_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 MASK_SUBPIX_VAR4XH_SSSE3(16)
 MASK_SUBPIX_VAR_SSSE3(16, 4)
 MASK_SUBPIX_VAR8XH_SSSE3(32)
 MASK_SUBPIX_VAR_SSSE3(32, 8)
 MASK_SUBPIX_VAR_SSSE3(64, 16)
 MASK_SUBPIX_VAR_SSSE3(16, 64)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR_SSSE3(128, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 128)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE __m128i filter_block(const __m128i a, const __m128i b,
                                    const __m128i filter) {
@@ -523,7 +517,6 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
-#if CONFIG_HIGHBITDEPTH
 // For width a multiple of 8
 static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                    int xoffset, int yoffset, uint16_t *dst,
@@ -695,11 +688,9 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                  \
   }
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
-#endif
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
@@ -713,18 +704,12 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32)
-#endif
-#endif
 
 static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
                                           const __m128i filter) {
@@ -1040,4 +1025,40 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
-#endif
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                              int width, int height, const uint8_t *ref,
+                              int ref_stride, const uint8_t *mask,
+                              int mask_stride, int invert_mask) {
+  const uint8_t *src0 = invert_mask ? pred : ref;  // invert_mask swaps roles
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;  // pred stride: width
+  const int stride1 = invert_mask ? ref_stride : width;
+  assert(height % 2 == 0);  // the 16-wide loop below handles two rows a pass
+  int i = 0;  // rows completed
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {  // two rows per iteration
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+                              mask + mask_stride, comp_pred + width);
+      comp_pred += (width << 1);  // output rows are packed at width bytes
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  } else {  // width == 32
+    assert(width == 32);
+    do {  // one row per iteration, as two 16-byte halves
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
+      comp_pred += (width);
+      src0 += (stride0);
+      src1 += (stride1);
+      mask += (mask_stride);
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 000000000..dc41a8342
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+// Blends 16 pixels: src0 weighted by mask, src1 by (MAX_ALPHA - mask).
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+                                           const uint8_t *src1,
+                                           const uint8_t *mask, uint8_t *dst) {
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  // Sources may be unaligned; mask and dst use aligned 16-byte accesses.
+  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+  // Interleave (src0, src1) bytes and (mask, inverse mask) bytes pairwise.
+  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+  // Each 16-bit lane = src0 * mask + src1 * (alpha_max - mask).
+  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+  // mulhrs by 2^(15 - ROUND_BITS) is a rounded right shift by ROUND_BITS.
+  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+// Blends two 8-pixel rows per iteration; comp_pred rows are 8 bytes apart.
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          const uint8_t *mask,
+                                          int mask_stride) {
+  int i = 0;
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  do {
+    // row A: first (odd-numbered) row of the pair
+    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+    // row B: second (even-numbered) row of the pair
+    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+    // a holds mask row A in its low half and mask row B in its high half.
+    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+    const __m128i ma = _mm_sub_epi8(alpha_max, a);
+    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+    // Weighted sums per 16-bit lane, then a rounded shift by ROUND_BITS.
+    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+    const __m128i round = _mm_packus_epi16(roundA, roundB);
+    // comp_pred's stride == width == 8
+    _mm_store_si128((__m128i *)(comp_pred), round);
+    comp_pred += (8 << 1);
+    src0 += (stride0 << 1);
+    src1 += (stride1 << 1);
+    mask += (mask_stride << 1);
+    i += 2;
+  } while (i < height);
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..8b69606dd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+  return _mm_castps_si128(  // keep low 64 bits of s, load high 64 from src
+      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+// Packs 4 bytes from each of four rows (byte_stride apart) into one register.
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  __m128i dst;
+  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));  // row 0
+  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);  // row 1 -> high half
+  return dst;
+}
+
+#endif  // AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
index 73589a32a..a3535f985 100644
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
   v_d = _mm_hadd_epi32(v_d, v_d);
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
index 52dd508ec..0338a8c77 100644
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
 
@@ -24,9 +25,11 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
-                                       const int32_t *wsrc, const int32_t *mask,
-                                       const int height) {
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                                 const int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask,
+                                                 const int height) {
   const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
@@ -59,11 +62,9 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
-                                        const int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, const int width,
-                                        const int height) {
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
   const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
@@ -119,11 +120,9 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
     }                                                          \
   }
 
-#if CONFIG_EXT_PARTITION
 OBMCSADWXH(128, 128)
 OBMCSADWXH(128, 64)
 OBMCSADWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 OBMCSADWXH(64, 64)
 OBMCSADWXH(64, 32)
 OBMCSADWXH(32, 64)
@@ -137,25 +136,22 @@ OBMCSADWXH(8, 8)
 OBMCSADWXH(8, 4)
 OBMCSADWXH(4, 8)
 OBMCSADWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 OBMCSADWXH(4, 16)
 OBMCSADWXH(16, 4)
 OBMCSADWXH(8, 32)
 OBMCSADWXH(32, 8)
 OBMCSADWXH(16, 64)
 OBMCSADWXH(64, 16)
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
-static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
-                                           const int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask,
-                                           const int height) {
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                                     const int pre_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     const int height) {
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   const int pre_step = pre_stride - 4;
   int n = 0;
@@ -189,11 +185,9 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
-                                            const int pre_stride,
-                                            const int32_t *wsrc,
-                                            const int32_t *mask,
-                                            const int width, const int height) {
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   const int pre_step = pre_stride - width;
   int n = 0;
@@ -250,11 +244,9 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
     }                                                             \
   }
 
-#if CONFIG_EXT_PARTITION
 HBD_OBMCSADWXH(128, 128)
 HBD_OBMCSADWXH(128, 64)
 HBD_OBMCSADWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HBD_OBMCSADWXH(64, 64)
 HBD_OBMCSADWXH(64, 32)
 HBD_OBMCSADWXH(32, 64)
@@ -268,12 +260,9 @@ HBD_OBMCSADWXH(8, 8)
 HBD_OBMCSADWXH(8, 4)
 HBD_OBMCSADWXH(4, 8)
 HBD_OBMCSADWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 HBD_OBMCSADWXH(4, 16)
 HBD_OBMCSADWXH(16, 4)
 HBD_OBMCSADWXH(8, 32)
 HBD_OBMCSADWXH(32, 8)
 HBD_OBMCSADWXH(16, 64)
 HBD_OBMCSADWXH(64, 16)
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 392616af3..571aa770b 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
 
@@ -128,11 +129,9 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
   }
 
-#if CONFIG_EXT_PARTITION
 OBMCVARWXH(128, 128)
 OBMCVARWXH(128, 64)
 OBMCVARWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 OBMCVARWXH(64, 64)
 OBMCVARWXH(64, 32)
 OBMCVARWXH(32, 64)
@@ -146,24 +145,17 @@ OBMCVARWXH(8, 8)
 OBMCVARWXH(8, 4)
 OBMCVARWXH(4, 8)
 OBMCVARWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 OBMCVARWXH(4, 16)
 OBMCVARWXH(16, 4)
 OBMCVARWXH(8, 32)
 OBMCVARWXH(32, 8)
 OBMCVARWXH(16, 64)
 OBMCVARWXH(64, 16)
-#if CONFIG_EXT_PARTITION
-OBMCVARWXH(32, 128)
-OBMCVARWXH(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
 static INLINE void hbd_obmc_variance_w4(
     const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
     const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
@@ -278,8 +270,19 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
   uint64_t sse64 = 0;
   if (w == 4) {
     hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
+  } else if (w < 128 || h < 128) {
     hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  } else {
+    assert(w == 128 && h == 128);
+
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+                            64);
+      pre8 += 64 * pre_stride;
+      wsrc += 64 * w;
+      mask += 64 * w;
+      h -= 64;
+    } while (h > 0);
   }
   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
@@ -291,28 +294,23 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                            unsigned int *sse, int *sum) {
   int64_t sum64 = 0;
   uint64_t sse64 = 0;
-  if (w == 128) {
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
-                            32);
-      pre8 += 32 * pre_stride;
-      wsrc += 32 * 128;
-      mask += 32 * 128;
-      h -= 32;
-    } while (h > 0);
-  } else if (w == 64 && h >= 128) {
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
-                            64);
-      pre8 += 64 * pre_stride;
-      wsrc += 64 * 64;
-      mask += 64 * 64;
-      h -= 64;
-    } while (h > 0);
-  } else if (w == 4) {
+  int max_pel_allowed_per_ovf = 512;
+  if (w == 4) {
     hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
+  } else if (w * h <= max_pel_allowed_per_ovf) {
     hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  } else {
+    int h_per_ovf = max_pel_allowed_per_ovf / w;
+
+    assert(max_pel_allowed_per_ovf % w == 0);
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+                            h_per_ovf);
+      pre8 += h_per_ovf * pre_stride;
+      wsrc += h_per_ovf * w;
+      mask += h_per_ovf * w;
+      h -= h_per_ovf;
+    } while (h > 0);
   }
   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
@@ -347,11 +345,9 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-#if CONFIG_EXT_PARTITION
 HBD_OBMCVARWXH(128, 128)
 HBD_OBMCVARWXH(128, 64)
 HBD_OBMCVARWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HBD_OBMCVARWXH(64, 64)
 HBD_OBMCVARWXH(64, 32)
 HBD_OBMCVARWXH(32, 64)
@@ -365,16 +361,9 @@ HBD_OBMCVARWXH(8, 8)
 HBD_OBMCVARWXH(8, 4)
 HBD_OBMCVARWXH(4, 8)
 HBD_OBMCVARWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 HBD_OBMCVARWXH(4, 16)
 HBD_OBMCVARWXH(16, 4)
 HBD_OBMCVARWXH(8, 32)
 HBD_OBMCVARWXH(32, 8)
 HBD_OBMCVARWXH(16, 64)
 HBD_OBMCVARWXH(64, 16)
-#if CONFIG_EXT_PARTITION
-HBD_OBMCVARWXH(32, 128)
-HBD_OBMCVARWXH(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
index 954a95b98..e6b40262d 100644
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -44,16 +44,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                            m0, [zbinq]              ; m0 = zbin
 
   ; Get DC and first 15 AC coeffs - in this special case, that is all.
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers but we process them as 16 bit numbers
   mova                            m9, [coeffq]
   packssdw                        m9, [coeffq+16]          ; m9 = c[i]
   mova                           m10, [coeffq+32]
   packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-%else
-  mova                            m9, [coeffq]             ; m9 = c[i]
-  mova                           m10, [coeffq+16]          ; m10 = c[i]
-%endif
 
   mov                             r0, eobmp                ; Output pointer
   mov                             r1, qcoeffmp             ; Output pointer
@@ -76,15 +71,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   ptest                          m14, m14
   jnz .single_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova                       [r1   ], ymm5
   mova                       [r1+32], ymm5
   mova                       [r2   ], ymm5
   mova                       [r2+32], ymm5
-%else
-  mova                          [r1], ymm5
-  mova                          [r2], ymm5
-%endif
   mov                           [r0], word 0
 
   vzeroupper
@@ -124,7 +114,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pand                            m8, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -136,16 +125,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmovsxwd                       m11, m13
   mova                  [qcoeffq+32], m11
   mova                  [qcoeffq+48], m6
-%else
-  mova                  [qcoeffq   ], m8
-  mova                  [qcoeffq+16], m13
-%endif
 
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
 
-%if CONFIG_HIGHBITDEPTH
   ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -157,10 +141,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmovsxwd                       m11, m13
   mova                 [dqcoeffq+32], m11
   mova                 [dqcoeffq+48], m6
-%else
-  mova                 [dqcoeffq   ], m8
-  mova                 [dqcoeffq+16], m13
-%endif
 
   mova                            m6, [iscanq]            ; m6 = scan[i]
   mova                           m11, [iscanq+16]         ; m11 = scan[i]
@@ -229,29 +209,20 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
 
-%if CONFIG_HIGHBITDEPTH
+
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
+
   lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [coeffq+ncoeffq*4+16]
   mova                           m10, [coeffq+ncoeffq*4+32]
   packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
 
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
@@ -264,16 +235,10 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   ptest                          m14, m14
   jnz .first_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4   ], ymm5
   mova        [qcoeffq+ncoeffq*4+32], ymm5
   mova       [dqcoeffq+ncoeffq*4   ], ymm5
   mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova           [qcoeffq+ncoeffq*2], ymm5
-  mova          [dqcoeffq+ncoeffq*2], ymm5
-%endif
-
   add                        ncoeffq, mmsize
 
   punpckhqdq                      m1, m1
@@ -302,7 +267,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pand                            m8, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -314,10 +278,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
 
 %ifidn %1, b_32x32
   pabsw                           m8, m8
@@ -333,7 +293,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   psignw                         m13, m10
 %endif
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -345,10 +304,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
 
   pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
@@ -363,16 +318,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
 .ac_only_loop:
 
-%if CONFIG_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [coeffq+ncoeffq*4+16]
   mova                           m10, [coeffq+ncoeffq*4+32]
   packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
 
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
@@ -385,15 +335,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   ptest                          m14, m14
   jnz .rest_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4+ 0], ymm5
   mova        [qcoeffq+ncoeffq*4+32], ymm5
   mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
   mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
-%endif
+
   add                        ncoeffq, mmsize
   jnz .ac_only_loop
 
@@ -424,7 +370,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pand                           m14, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
@@ -436,10 +381,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
 
 %ifidn %1, b_32x32
   pabsw                          m14, m14
@@ -454,7 +395,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   psignw                         m13, m10
 %endif
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
@@ -466,10 +406,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
 
   pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
@@ -510,27 +446,16 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
 
-%if CONFIG_HIGHBITDEPTH
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-
   neg                        ncoeffq
   pxor                            m7, m7
 
 .blank_loop:
-%if CONFIG_HIGHBITDEPTH
   mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
   mova       [dqcoeffq+ncoeffq*4+32], ymm7
   mova        [qcoeffq+ncoeffq*4+ 0], ymm7
   mova        [qcoeffq+ncoeffq*4+32], ymm7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
-%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
 
@@ -543,5 +468,3 @@ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
 INIT_XMM avx
 QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
-
-END
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 0e7f679d0..46b9c7d29 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -12,7 +12,8 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
index 36b4dddbd..e2c1ebb71 100644
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -45,7 +45,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
   mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
+  psubw                           m0, [GLOBAL(pw_1)]
   mov                             r2, shiftmp
   mov                             r3, qcoeffmp
   mova                            m4, [r2]                 ; m4 = shift
@@ -56,29 +56,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-%if CONFIG_HIGHBITDEPTH
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
   lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
   packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -99,7 +88,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                            m8, m7
   pand                           m13, m12
-%if CONFIG_HIGHBITDEPTH
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                           m11, m8
   mova                            m6, m8
@@ -117,10 +106,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
+
 %ifidn %1, b_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
@@ -134,7 +120,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                          m8, m9
   psignw                         m13, m10
 %endif
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                            m11, m8
   mova                            m6, m8
@@ -152,10 +137,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -169,16 +150,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   jz .accumulate_eob
 
 .ac_only_loop:
-%if CONFIG_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
   packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
+
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -201,7 +178,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                           m14, m7
   pand                           m13, m12
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pxor                           m11, m11
   mova                           m11, m14
@@ -220,10 +196,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
+
 %ifidn %1, b_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
@@ -236,7 +209,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m14, m9
   psignw                         m13, m10
 %endif
-%if CONFIG_HIGHBITDEPTH
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                           m11, m14
   mova                            m6, m14
@@ -254,10 +227,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
+
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -274,7 +244,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %ifidn %1, b_32x32
   jmp .accumulate_eob
 .skip_iter:
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4+ 0], m5
   mova        [qcoeffq+ncoeffq*4+16], m5
   mova        [qcoeffq+ncoeffq*4+32], m5
@@ -283,12 +252,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+16], m5
   mova       [dqcoeffq+ncoeffq*4+32], m5
   mova       [dqcoeffq+ncoeffq*4+48], m5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-%endif
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 %endif
@@ -313,17 +276,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mov                             r2, qcoeffmp
   mov                             r3, eobmp
   DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-%if CONFIG_HIGHBITDEPTH
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
   neg                        ncoeffq
   pxor                            m7, m7
 .blank_loop:
-%if CONFIG_HIGHBITDEPTH
   mova       [dqcoeffq+ncoeffq*4+ 0], m7
   mova       [dqcoeffq+ncoeffq*4+16], m7
   mova       [dqcoeffq+ncoeffq*4+32], m7
@@ -332,12 +289,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+16], m7
   mova        [qcoeffq+ncoeffq*4+32], m7
   mova        [qcoeffq+ncoeffq*4+48], m7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
   mov                    word [eobq], 0
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
index e60f518b4..f662b62b1 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>  // AVX2
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
index 2c67f450f..55a856985 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -233,11 +233,9 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 %endmacro
 
 INIT_XMM sse2
-%if CONFIG_EXT_PARTITION
 SADNXN4D 128, 128
 SADNXN4D 128, 64
 SADNXN4D 64,  128
-%endif
 SADNXN4D 64, 64
 SADNXN4D 64, 32
 SADNXN4D 32, 64
@@ -251,11 +249,9 @@ SADNXN4D  8,  8
 SADNXN4D  8,  4
 SADNXN4D  4,  8
 SADNXN4D  4,  4
-%if CONFIG_EXT_PARTITION_TYPES
 SADNXN4D  4, 16
 SADNXN4D 16,  4
 SADNXN4D  8, 32
 SADNXN4D 32,  8
 SADNXN4D 16, 64
 SADNXN4D 64, 16
-%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
index efba61289..a50dba64a 100644
--- a/third_party/aom/aom_dsp/x86/sad_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
 #define FSAD64_H(h)                                                           \
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
index e8dd87a26..b506d4663 100644
--- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
@@ -11,10 +11,11 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/mem.h"
 
 // SAD
@@ -360,7 +361,6 @@ unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
   return sum;
 }
 
-#if CONFIG_EXT_PARTITION
 static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
                      const uint16_t *sec_ptr, __m256i *sad_acc) {
   __m256i s[8], r[8];
@@ -471,7 +471,6 @@ unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
   sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
   return sum;
 }
-#endif  // CONFIG_EXT_PARTITION
 
 // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
 static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
@@ -649,7 +648,6 @@ unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
   return sum;
 }
 
-#if CONFIG_EXT_PARTITION
 unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            const uint8_t *second_pred) {
@@ -697,19 +695,13 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
                                        second_pred);
   return sum;
 }
-#endif  // CONFIG_EXT_PARTITION
 
 // SAD 4D
 // Combine 4 __m256i vectors to uint32_t result[4]
 static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                                uint32_t *res) {
   __m256i u0, u1, u2, u3;
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-  const __m256i mask = _mm256_setr_epi32(UINT32_MAX, 0, UINT32_MAX, 0,
-                                         UINT32_MAX, 0, UINT32_MAX, 0);
-#else
-  const __m256i mask = _mm256_set1_epi64x(UINT32_MAX);
-#endif
+  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
   __m128i sad;
 
   // 8 32-bit summation
@@ -967,7 +959,6 @@ void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
   sad_array[3] = first_half[3] + second_half[3];
 }
 
-#if CONFIG_EXT_PARTITION
 void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref_array[],
                                   int ref_stride, uint32_t *sad_array) {
@@ -1045,4 +1036,3 @@ void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
   sad_array[2] = first_half[2] + second_half[2];
   sad_array[3] = first_half[3] + second_half[3];
 }
-#endif  // CONFIG_EXT_PARTITION
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
index 4419c65b2..c6fd62c9e 100644
--- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -10,7 +10,8 @@
  */
 
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
 
 static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride) {
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
index b4cc6abf1..3251b7655 100644
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -47,7 +47,6 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 %endif ; %3 == 7
 %endmacro
 
-%if CONFIG_EXT_PARTITION
 ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
 ;                                  uint8_t *ref, int ref_stride);
 %macro SAD128XN 1-2 0
@@ -114,7 +113,6 @@ SAD128XN 128     ; sad128x128_sse2
 SAD128XN 128, 1  ; sad128x128_avg_sse2
 SAD128XN 64      ; sad128x64_sse2
 SAD128XN 64, 1   ; sad128x64_avg_sse2
-%endif
 
 
 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
@@ -155,18 +153,14 @@ SAD128XN 64, 1   ; sad128x64_avg_sse2
 %endmacro
 
 INIT_XMM sse2
-%if CONFIG_EXT_PARTITION
 SAD64XN 128     ; sad64x128_sse2
 SAD64XN 128, 1  ; sad64x128_avg_sse2
-%endif
 SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
 SAD64XN 64, 1 ; sad64x64_avg_sse2
 SAD64XN 32, 1 ; sad64x32_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD64XN 16 ; sad64x16_sse2
 SAD64XN 16, 1 ; sad64x16_avg_sse2
-%endif
 
 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
@@ -212,10 +206,8 @@ SAD32XN 16 ; sad32x16_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD32XN 8 ; sad_32x8_sse2
 SAD32XN 8, 1 ; sad_32x8_avg_sse2
-%endif
 
 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -262,12 +254,10 @@ SAD16XN  8 ; sad16x8_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD16XN 4 ; sad_16x4_sse2
 SAD16XN 4, 1 ; sad_16x4_avg_sse2
 SAD16XN 64 ; sad_16x64_sse2
 SAD16XN 64, 1 ; sad_16x64_avg_sse2
-%endif
 
 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
@@ -312,10 +302,8 @@ SAD8XN  4 ; sad8x4_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD8XN 32 ; sad_8x32_sse2
 SAD8XN 32, 1 ; sad_8x32_avg_sse2
-%endif
 
 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
@@ -361,7 +349,5 @@ SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
-%if CONFIG_EXT_PARTITION_TYPES
 SAD4XN 16 ; sad_4x16_sse2
 SAD4XN 16, 1 ; sad_4x16_avg_sse2
-%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm
deleted file mode 100644
index f6c27c855..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,377 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBAOM_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBAOM_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-;void int aom_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x16x3_sse3) PRIVATE
-sym(aom_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x8x3_sse3) PRIVATE
-sym(aom_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad8x16x3_sse3) PRIVATE
-sym(aom_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad8x8x3_sse3) PRIVATE
-sym(aom_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad4x4x3_sse3) PRIVATE
-sym(aom_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 5e9c75845..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,362 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
-    mov             rdi,        arg(4)           ;Results
-    pxor            xmm0, xmm0
-    movdqa          xmm2, xmm1
-    punpcklwd       xmm1, xmm0
-    punpckhwd       xmm2, xmm0
-
-    movdqa          [rdi],    xmm1
-    movdqa          [rdi + 16],    xmm2
-%endmacro
-
-;void aom_sad16x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(aom_sad16x16x8_sse4_1) PRIVATE
-sym(aom_sad16x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad16x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad16x8x8_sse4_1) PRIVATE
-sym(aom_sad16x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad8x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad8x8x8_sse4_1) PRIVATE
-sym(aom_sad8x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad8x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad8x16x8_sse4_1) PRIVATE
-sym(aom_sad8x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad4x4x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad4x4x8_sse4_1) PRIVATE
-sym(aom_sad4x4x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_4X2X8 1
-    PROCESS_4X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index 96b64b040..000000000
--- a/third_party/aom/aom_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,373 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void int aom_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x16x3_ssse3) PRIVATE
-sym(aom_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .aom_sad16x16x3_ssse3_skiptable
-.aom_sad16x16x3_ssse3_jumptable:
-        dd .aom_sad16x16x3_ssse3_aligned_by_0  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_1  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_2  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_3  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_4  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_5  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_6  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_7  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_8  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_9  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump
-.aom_sad16x16x3_ssse3_skiptable:
-
-        call .aom_sad16x16x3_ssse3_do_jump
-.aom_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of aom_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3
-
-.aom_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.aom_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void int aom_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x8x3_ssse3) PRIVATE
-sym(aom_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .aom_sad16x8x3_ssse3_skiptable
-.aom_sad16x8x3_ssse3_jumptable:
-        dd .aom_sad16x8x3_ssse3_aligned_by_0  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_1  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_2  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_3  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_4  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_5  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_6  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_7  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_8  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_9  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump
-.aom_sad16x8x3_ssse3_skiptable:
-
-        call .aom_sad16x8x3_ssse3_do_jump
-.aom_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of aom_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3
-
-.aom_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.aom_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
index aa70106c8..6d9b5a12f 100644
--- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
@@ -47,6 +47,9 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
+
+SECTION .text
+
 ;void ssim_parms_sse2(
 ;    unsigned char *s,
 ;    int sp,
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
index d3feb7ec0..45bf6ec3c 100644
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -117,27 +117,26 @@ SECTION .text
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
 
-%ifdef PIC    ; 64bit PIC
+%if ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                      x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
+                                        x_offset, y_offset, dst, dst_stride, \
+                                        sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                    x_offset, y_offset, dst, dst_stride, \
+                                    height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                          x_offset, y_offset, dst, dst_stride, \
+                                          sec, sec_stride, height, sse, \
+                                          g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp
 
@@ -155,9 +154,9 @@ SECTION .text
 
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                                y_offset, dst, dst_stride, height, sse, \
-                                g_bilin_filter, g_pw_8
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, dst, dst_stride, \
+                                      height, sse, g_bilin_filter, g_pw_8
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
@@ -176,25 +175,18 @@ SECTION .text
     %endif
   %else
     %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                          x_offset, y_offset, \
+                                          dst, dst_stride, sec, sec_stride, \
+                                          height, sse
       %define block_height dword heightm
       %define sec_str sec_stridemp
-      %endif
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                              y_offset, dst, dst_stride, height, sse
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, dst, dst_stride, \
+                                      height, sse
       %define block_height heightd
     %endif
-
     %define bilin_filter bilin_filter_m
   %endif
 %endif
@@ -374,8 +366,8 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -383,7 +375,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -400,7 +392,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -697,8 +689,8 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -706,7 +698,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -723,7 +715,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -855,8 +847,8 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -864,7 +856,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -881,7 +873,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -997,8 +989,8 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -1006,7 +998,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -1023,7 +1015,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -1195,8 +1187,8 @@ SECTION .text
   STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
@@ -1209,7 +1201,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                m11, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
@@ -1237,7 +1229,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
index 7bd5b23ad..1a75a234f 100644
--- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -34,10 +34,8 @@ cglobal subtract_block, 7, 7, 8, \
   je .case_16
   cmp                colsd, 32
   je .case_32
-%if CONFIG_EXT_PARTITION
   cmp                colsd, 64
   je .case_64
-%endif
 
 %macro loop16 6
   mova                  m0, [srcq+%1]
@@ -62,7 +60,6 @@ cglobal subtract_block, 7, 7, 8, \
   mova [diffq+mmsize*1+%6], m1
 %endmacro
 
-%if CONFIG_EXT_PARTITION
   mov             pred_str, pred_stridemp
 .loop_128:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
@@ -77,7 +74,6 @@ cglobal subtract_block, 7, 7, 8, \
   RET
 
 .case_64:
-%endif
   mov             pred_str, pred_stridemp
 .loop_64:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
index 6be99fbca..a79f22d79 100644
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -14,35 +14,62 @@
 #include <stdio.h>
 
 #include "aom_dsp/x86/synonyms.h"
+#include "config/aom_dsp_rtcd.h"
 
-#include "./aom_dsp_rtcd.h"
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d ad = _mm_castsi128_pd(a);
+  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, a);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
 
 static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                 int stride) {
-  const __m128i v_val_0_w =
-      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
-  const __m128i v_val_1_w =
-      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
-  const __m128i v_val_2_w =
-      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
-  const __m128i v_val_3_w =
-      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
-
-  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
-  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
-  const __m128i v_sum_d =
+  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+  __m128i v_sum_d =
       _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-
+  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
   return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
 }
 
+static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                                int height) {
+  int r = 0;
+  __m128i v_acc_q = _mm_setzero_si128();
+  do {
+    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
+  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+  return xx_cvtsi128_si64(v_acc_64);
+}
+
 #ifdef __GNUC__
 // This prevents GCC/Clang from inlining this function into
 // aom_sum_squares_2d_i16_sse2, which in turn saves some stack
@@ -52,72 +79,45 @@ __attribute__((noinline))
 static uint64_t
 aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
                                 int height) {
-  int r, c;
+  int r = 0;
 
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
   __m128i v_acc_q = _mm_setzero_si128();
 
-  for (r = 0; r < height; r += 8) {
+  do {
     __m128i v_acc_d = _mm_setzero_si128();
-
-    for (c = 0; c < width; c += 8) {
+    int c = 0;
+    do {
       const int16_t *b = src + c;
 
-      const __m128i v_val_0_w =
-          _mm_load_si128((const __m128i *)(b + 0 * stride));
-      const __m128i v_val_1_w =
-          _mm_load_si128((const __m128i *)(b + 1 * stride));
-      const __m128i v_val_2_w =
-          _mm_load_si128((const __m128i *)(b + 2 * stride));
-      const __m128i v_val_3_w =
-          _mm_load_si128((const __m128i *)(b + 3 * stride));
-      const __m128i v_val_4_w =
-          _mm_load_si128((const __m128i *)(b + 4 * stride));
-      const __m128i v_val_5_w =
-          _mm_load_si128((const __m128i *)(b + 5 * stride));
-      const __m128i v_val_6_w =
-          _mm_load_si128((const __m128i *)(b + 6 * stride));
-      const __m128i v_val_7_w =
-          _mm_load_si128((const __m128i *)(b + 7 * stride));
+      const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+      const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+      const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+      const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
 
       const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
       const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
       const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
       const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
-      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
-      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
-      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
 
       const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
       const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
-      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
 
       const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
 
       v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
-    }
+      c += 8;
+    } while (c < width);
 
     v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
     v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
 
-    src += 8 * stride;
-  }
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
 
   v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
-    return tmp;
-  }
-#endif
+  return xx_cvtsi128_si64(v_acc_q);
 }
 
 uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
@@ -127,7 +127,9 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
   // are with size == 4, so it is also the common case.
   if (LIKELY(width == 4 && height == 4)) {
     return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) {
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
     // Generic case
     return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
   } else {
@@ -140,7 +142,7 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
 //////////////////////////////////////////////////////////////////////////////
 
 static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
   __m128i v_acc0_q = _mm_setzero_si128();
   __m128i v_acc1_q = _mm_setzero_si128();
 
@@ -185,16 +187,7 @@ static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
 
   v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
   v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
-    return tmp;
-  }
-#endif
+  return xx_cvtsi128_si64(v_acc0_q);
 }
 
 uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index cd049a454..d9a53fcc5 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -14,7 +14,8 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 /**
@@ -58,6 +59,28 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) {
   _mm_storeu_si128((__m128i *)a, v);
 }
 
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// with each 32-bit argument zero-extended to 64 bits.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, e1, 0, e0);
+#else
+  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// with its 32-bit argument zero-extended to 64 bits.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, a, 0, a);
+#else
+  return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
 static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
   return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
 }
@@ -89,4 +112,12 @@ static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
   return _mm_srai_epi32(v_tmp_d, bits);
 }
 
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+  const __m128i v_tmp_d =
+      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
 #endif  // AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 000000000..39f371fc9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+  return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+  _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+  _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// with its 32-bit argument zero-extended to 64 bits.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+  return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+#endif  // AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 000000000..f88a1527d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+                                      __m128i *const out) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03 04 05 06 07
+  // in[1]: 10 11 12 13 14 15 16 17
+  // in[2]: 20 21 22 23 24 25 26 27
+  // in[3]: 30 31 32 33 34 35 36 37
+  // in[4]: 40 41 42 43 44 45 46 47
+  // in[5]: 50 51 52 53 54 55 56 57
+  // in[6]: 60 61 62 63 64 65 66 67
+  // in[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+  // Unpack 16 bit elements resulting in:
+  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+  // Unpack 32 bit elements resulting in:
+  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30 40 50 60 70
+  // out[1]: 01 11 21 31 41 51 61 71
+  // out[2]: 02 12 22 32 42 52 62 72
+  // out[3]: 03 13 23 33 43 53 63 73
+  // out[4]: 04 14 24 34 44 54 64 74
+  // out[5]: 05 15 25 35 45 55 65 75
+  // out[6]: 06 16 26 36 46 56 66 76
+  // out[7]: 07 17 27 37 47 57 67 77
+  out[0] = _mm_unpacklo_epi64(c0, c0);
+  out[1] = _mm_unpackhi_epi64(c0, c0);
+  out[2] = _mm_unpacklo_epi64(c1, c1);
+  out[3] = _mm_unpackhi_epi64(c1, c1);
+  out[4] = _mm_unpacklo_epi64(c2, c2);
+  out[5] = _mm_unpackhi_epi64(c2, c2);
+  out[6] = _mm_unpacklo_epi64(c3, c3);
+  out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi32(a0, a1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(a0, a1);
+  out[3] = _mm_srli_si128(out[2], 8);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // in[4]: 40 41 42 43  XX XX XX XX
+  // in[5]: 50 51 52 53  XX XX XX XX
+  // in[6]: 60 61 62 63  XX XX XX XX
+  // in[7]: 70 71 72 73  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 02 12 22 32  03 13 23 33
+  // b3: 42 52 62 72  43 53 63 73
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b2, b3);
+  out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b2: 04 14 24 34  05 15 25 35
+  // b4: 02 12 22 32  03 13 23 33
+  // b6: 06 16 26 36  07 17 27 37
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  XX XX XX XX
+  // out[1]: 01 11 21 31  XX XX XX XX
+  // out[2]: 02 12 22 32  XX XX XX XX
+  // out[3]: 03 13 23 33  XX XX XX XX
+  // out[4]: 04 14 24 34  XX XX XX XX
+  // out[5]: 05 15 25 35  XX XX XX XX
+  // out[6]: 06 16 26 36  XX XX XX XX
+  // out[7]: 07 17 27 37  XX XX XX XX
+  const __m128i zeros = _mm_setzero_si128();
+  out[0] = _mm_unpacklo_epi64(b0, zeros);
+  out[1] = _mm_unpackhi_epi64(b0, zeros);
+  out[2] = _mm_unpacklo_epi64(b4, zeros);
+  out[3] = _mm_unpackhi_epi64(b4, zeros);
+  out[4] = _mm_unpacklo_epi64(b2, zeros);
+  out[5] = _mm_unpackhi_epi64(b2, zeros);
+  out[6] = _mm_unpacklo_epi64(b6, zeros);
+  out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+  // in[4]: 40 41 42 43  44 45 46 47
+  // in[5]: 50 51 52 53  54 55 56 57
+  // in[6]: 60 61 62 63  64 65 66 67
+  // in[7]: 70 71 72 73  74 75 76 77
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  // a6:    44 54 45 55  46 56 47 57
+  // a7:    64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 04 14 24 34  05 15 25 35
+  // b3: 44 54 64 74  45 55 65 75
+  // b4: 02 12 22 32  03 13 23 33
+  // b5: 42 52 62 72  43 53 63 73
+  // b6: 06 16 26 36  07 17 27 37
+  // b7: 46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  // out[4]: 04 14 24 34  44 54 64 74
+  // out[5]: 05 15 25 35  45 55 65 75
+  // out[6]: 06 16 26 36  46 56 66 76
+  // out[7]: 07 17 27 37  47 57 67 77
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b4, b5);
+  out[3] = _mm_unpackhi_epi64(b4, b5);
+  out[4] = _mm_unpacklo_epi64(b2, b3);
+  out[5] = _mm_unpackhi_epi64(b2, b3);
+  out[6] = _mm_unpacklo_epi64(b6, b7);
+  out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i tbuf[8];
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, tbuf);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  left[8] = tbuf[0];
+  left[9] = tbuf[1];
+  left[10] = tbuf[2];
+  left[11] = tbuf[3];
+  left[12] = tbuf[4];
+  left[13] = tbuf[5];
+  left[14] = tbuf[6];
+  left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+                                         __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // in[4]: 04 05 06 07
+  // in[5]: 14 15 16 17
+  // in[6]: 24 25 26 27
+  // in[7]: 34 35 36 37
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+  // a4:    04 14 05 15
+  // a5:    24 34 25 35
+  // a6:    06 16 07 17
+  // a7:    26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 04 05 06 07
+  // in[2]: 10 11 12 13
+  // in[3]: 14 15 16 17
+  // in[4]: 20 21 22 23
+  // in[5]: 24 25 26 27
+  // in[6]: 30 31 32 33
+  // in[7]: 34 35 36 37
+  // to:
+  // a0: 00 10 01 11
+  // a1: 20 30 21 31
+  // a2: 02 12 03 13
+  // a3: 22 32 23 33
+  // a4: 04 14 05 15
+  // a5: 24 34 25 35
+  // a6: 06 16 07 17
+  // a7: 26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif  // AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
deleted file mode 100644
index 1a8fed710..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
-#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
-
-#include <immintrin.h>
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/common_avx2.h"
-
-#define pair256_set_epi16(a, b)                                            \
-  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define pair256_set_epi32(a, b)                                                \
-  _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
-                   (int)(b), (int)(a))
-
-static INLINE void mm256_reverse_epi16(__m256i *u) {
-  const __m256i control = _mm256_set_epi16(
-      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
-      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
-  __m256i v = _mm256_shuffle_epi8(*u, control);
-  *u = _mm256_permute2x128_si256(v, v, 1);
-}
-
-static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
-                                 const __m256i *cospi) {
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i y0 = _mm256_madd_epi16(*a0, *cospi);
-  __m256i y1 = _mm256_madd_epi16(*a1, *cospi);
-
-  y0 = _mm256_add_epi32(y0, dct_rounding);
-  y1 = _mm256_add_epi32(y1, dct_rounding);
-  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
-
-  return _mm256_packs_epi32(y0, y1);
-}
-
-static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
-  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i u0, u1;
-  int i = 0;
-
-  while (i < 16) {
-    in[i] = _mm256_slli_epi16(in[i], 1);
-
-    u0 = _mm256_unpacklo_epi16(zero, in[i]);
-    u1 = _mm256_unpackhi_epi16(zero, in[i]);
-
-    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
-    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
-
-    u0 = _mm256_add_epi32(u0, dct_const_rounding);
-    u1 = _mm256_add_epi32(u1, dct_const_rounding);
-
-    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
-    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
-    in[i] = _mm256_packs_epi32(u0, u1);
-    i++;
-  }
-}
-
-#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
deleted file mode 100644
index 4e6eecd32..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
-#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m128i zero = _mm_setzero_si128();
-    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-    _mm_storeu_si128((__m128i *)(dst_ptr), out0);
-    _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-  } else {
-    _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-  }
-}
-
-#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
index 4257d8b9c..58a792424 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -16,17 +16,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
 
-#define pair_set_epi16(a, b)                                            \
-  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define dual_set_epi16(a, b)                                            \
-  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
-                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
-
-#define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
-  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
-                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+#define pair_set_epi16(a, b) \
+  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
 
 // Reverse the 8 16 bit words in __m128i
 static INLINE __m128i mm_reverse_epi16(const __m128i x) {
@@ -35,292 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) {
   return _mm_shuffle_epi32(b, 0x4e);
 }
 
-#if CONFIG_EXT_TX
-// Identity transform (both forward and inverse).
-static INLINE void idtx16_8col(__m128i *in) {
-  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
-  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
-
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-  in[8] = _mm_slli_epi16(in[8], 1);
-  in[9] = _mm_slli_epi16(in[9], 1);
-  in[10] = _mm_slli_epi16(in[10], 1);
-  in[11] = _mm_slli_epi16(in[11], 1);
-  in[12] = _mm_slli_epi16(in[12], 1);
-  in[13] = _mm_slli_epi16(in[13], 1);
-  in[14] = _mm_slli_epi16(in[14], 1);
-  in[15] = _mm_slli_epi16(in[15], 1);
-
-  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
-  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
-  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
-  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
-  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
-  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
-  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
-
-  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
-  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
-  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
-  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
-  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
-  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
-  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
-  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
-
-  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
-  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
-  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
-  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
-  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
-  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
-  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
-  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
-
-  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
-  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
-  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
-  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
-  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
-  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
-  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
-  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
-
-  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
-  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
-  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
-  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
-  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
-  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
-  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
-
-  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
-  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
-  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
-  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
-  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
-  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
-  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
-  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
-
-  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
-  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
-  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
-  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
-  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
-  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
-  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
-  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
-
-  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
-  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
-  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
-  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
-  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
-  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
-  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
-  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
-
-  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
-  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
-  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
-  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
-  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
-  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
-  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
-
-  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(v0, x0);
-  in[1] = _mm_packs_epi32(v1, x1);
-  in[2] = _mm_packs_epi32(v2, x2);
-  in[3] = _mm_packs_epi32(v3, x3);
-  in[4] = _mm_packs_epi32(v4, x4);
-  in[5] = _mm_packs_epi32(v5, x5);
-  in[6] = _mm_packs_epi32(v6, x6);
-  in[7] = _mm_packs_epi32(v7, x7);
-
-  in[8] = _mm_packs_epi32(u0, y0);
-  in[9] = _mm_packs_epi32(u1, y1);
-  in[10] = _mm_packs_epi32(u2, y2);
-  in[11] = _mm_packs_epi32(u3, y3);
-  in[12] = _mm_packs_epi32(u4, y4);
-  in[13] = _mm_packs_epi32(u5, y5);
-  in[14] = _mm_packs_epi32(u6, y6);
-  in[15] = _mm_packs_epi32(u7, y7);
-}
-#endif  // CONFIG_EXT_TX
-
-static INLINE void scale_sqrt2_8x4(__m128i *in) {
-  // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
-  // consecutive elements.
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
-  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
-  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
-  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
-  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
-}
-
-static INLINE void scale_sqrt2_8x8(__m128i *in) {
-  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
-  // for each element.
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
-  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
-  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
-  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
-  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
-  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
-  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
-  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
-  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
-  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
-  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
-  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
-  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
-  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
-  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
-  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
-  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
-  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
-  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
-  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
-  in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
-  in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
-  in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
-  in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
-}
-
-static INLINE void scale_sqrt2_8x16(__m128i *in) {
-  scale_sqrt2_8x8(in);
-  scale_sqrt2_8x8(in + 8);
-}
-
 #endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 18a70dffe..7d6b7d287 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -10,109 +10,224 @@
  */
 
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
-
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse, int *sum);
-
-void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum);
-
-static void variance_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, int w, int h,
-                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
-                          int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+  return _mm_add_epi16(_mm256_castsi256_si128(val),        // 16-bit elems 0..7
+                       _mm256_extractf128_si256(val, 1));  // + elems 8..15
 }
 
-unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                aom_get16x16var_avx2, 16);
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+  return _mm_add_epi32(_mm256_castsi256_si128(val),        // 32-bit elems 0..3
+                       _mm256_extractf128_si256(val, 1));  // + elems 4..7
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+                                        __m256i *const sse,
+                                        __m256i *const sum) {
+  const __m256i adj_sub = _mm256_set1_epi16((short)0xff01);  // bytes (1,-1)
+
+  // interleave bytes into (src, ref) pairs, one pair per 16-bit element
+  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+  // per pair: src*1 + ref*(-1) = src - ref as signed 16-bit, then square
+  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
 
-  variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-  _mm256_zeroupper();
-  return variance;
+  // accumulate: 16-bit sums of differences, 32-bit sums of squares
+  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
 }
 
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
-  _mm256_zeroupper();
-  return *sse;
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+                                                     unsigned int *const sse) {
+  // reduce the 8 32-bit sse partials to 4 by adding the two 128-bit lanes
+  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+  // interleave sse/sum and add: {sse0+sse2, sum0+sum2, sse1+sse3, sum1+sum3}
+  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+  // fold the upper 64 bits down: element 0 = total sse, element 1 = total sum
+  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+  *((int *)sse) = _mm_cvtsi128_si32(res);  // total sse, stored through *sse
+  return _mm_extract_epi32(res, 1);        // total signed sum of differences
+}
+
+// handle blocks of up to 512 pixels (16x4 .. 16x32, 32x8, 32x16)
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+                                          unsigned int *const sse) {
+  // all-16-bit reduction (16 -> 8 -> 4 partial sums); widened only at the end
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
+  const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);  // low 4 -> 32-bit
+  return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
+}
+
+// handle 1024 pixels (32x32, 16x64, 64x16)
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  // one 16-bit fold (16 -> 8 partial sums), then widen before the next add
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i vsum_64 =  // four 32-bit partial sums
+      _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+                    _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+  return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+  const __m256i sum_hi =  // 16-bit elems 8..15, sign-extended to 32 bits
+      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+  return _mm256_add_epi32(sum_lo, sum_hi);  // 16 x 16-bit -> 8 x 32-bit
+}
+
+// handle 2048 pixels (32x64, 64x32); the sums are widened before any folding
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  vsum = sum_to_32bit_avx2(vsum);  // 8 x 32-bit partial sums from here on
+  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
+  return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
+}
+
+static INLINE void variance16_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+  variance_kernel_avx2(s, r, sse, sum);  // row 0 in low lane, row 1 in high
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m256i *const sse,
+                                          __m256i *const sum) {
+  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));  // 32 bytes
+  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));  // 32 bytes
+  variance_kernel_avx2(s, r, sse, sum);  // one 32-pixel row segment per call
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();  // only vsum is reset; *vsse accumulates
+
+  for (int i = 0; i < h; i += 2) {  // two 16-pixel rows per iteration
+    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
 }
 
-unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();  // only vsum is reset; *vsse accumulates
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {  // one 32-pixel row per iteration
+    variance32_kernel_avx2(src, ref, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();  // only vsum is reset; *vsse accumulates
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {  // one 64-pixel row = two 32-byte segments
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m256i *const vsse,
+                                    __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();  // only vsum is reset; *vsse accumulates
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {  // one 128-pixel row = four 32-byte segments
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
+    variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m256i vsse = _mm256_setzero_si256();                                    \
+    __m256i vsum;                                                             \
+    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
+    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
+
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
-  _mm256_zeroupper();
-  return variance;
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
+
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m256i vsse = _mm256_setzero_si256();                                    \
+    __m256i vsum = _mm256_setzero_si256();                                    \
+    for (int i = 0; i < (bh / uh); i++) {                                     \
+      __m256i vsum16;                                                         \
+      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
+                          &vsum16);                                           \
+      vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
+      src += uh * src_stride;                                                 \
+      ref += uh * ref_stride;                                                 \
+    }                                                                         \
+    const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
+    const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
+    return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
+  }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);  // sets *sse
+  return *sse;  // the variance return value is discarded; only sse is used
 }
 
 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -125,68 +240,164 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
     int height, unsigned int *sseptr);
 
-unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  unsigned int sse1;
-  const int se1 = aom_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
-  unsigned int sse2;
-  const int se2 =
-      aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
-                                      dst + 32, dst_stride, 64, &sse2);
-  const int se = se1 + se2;
-  unsigned int variance;
-  *sse = sse1 + sse2;
-
-  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  const int se = aom_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
-
-  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_sub_pixel_avg_variance64x64_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
-  unsigned int sse1;
-  const int se1 = aom_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
-  unsigned int sse2;
-  const int se2 = aom_sub_pixel_avg_variance32xh_avx2(
-      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
-      64, 64, &sse2);
-  const int se = se1 + se2;
-  unsigned int variance;
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2)                        \
+  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                        \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Avoid overflow in helper by capping height.*/                           \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) {                                      \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) {                                    \
+        unsigned int sse2;                                                    \
+        const int se2 = aom_sub_pixel_variance##wf##xh_avx2(                  \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2);                                                           \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse;                                                           \
+    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
+  }
+
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
+      const uint8_t *sec) {                                               \
+    /*Avoid overflow in helper by capping height.*/                       \
+    const int hf = AOMMIN(h, 64);                                         \
+    unsigned int sse = 0;                                                 \
+    int se = 0;                                                           \
+    for (int i = 0; i < (w / wf); ++i) {                                  \
+      const uint8_t *src_ptr = src;                                       \
+      const uint8_t *dst_ptr = dst;                                       \
+      const uint8_t *sec_ptr = sec;                                       \
+      for (int j = 0; j < (h / hf); ++j) {                                \
+        unsigned int sse2;                                                \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2(          \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+            sec_ptr, w, hf, &sse2);                                       \
+        dst_ptr += hf * dst_stride;                                       \
+        src_ptr += hf * src_stride;                                       \
+        sec_ptr += hf * w;                                                \
+        se += se2;                                                        \
+        sse += sse2;                                                      \
+      }                                                                   \
+      src += wf;                                                          \
+      dst += wf;                                                          \
+      sec += wf;                                                          \
+    }                                                                     \
+    *sse_ptr = sse;                                                       \
+    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
+  }
 
-  *sse = sse1 + sse2;
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
 
-  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
-  _mm256_zeroupper();
-  return variance;
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+  const __m256i d =  // lo lane = 16 bytes at p1; hi lane = 16 bytes at p0
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
 }
 
-unsigned int aom_sub_pixel_avg_variance32x32_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
-  // Process 32 elements in parallel.
-  const int se = aom_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
-
-  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
-  _mm256_zeroupper();
-  return variance;
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+                                            const __m256i a,
+                                            uint8_t *comp_pred) {
+  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
+  // ma = alpha_max - a: the complementary weight, applied to s1
+  const __m256i ma = _mm256_sub_epi8(alpha_max, a);
+  // interleave (s0, s1) bytes with the matching (a, ma) weight bytes
+  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
+  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
+  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
+  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
+  // s0*a + s1*ma, then mulhrs == rounded >> AOM_BLEND_A64_ROUND_BITS
+  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
+  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
+  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
+  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
+  // saturate back to unsigned bytes and store 32 output bytes (unaligned)
+  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
+  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
+}
+
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask) {
+  int i = 0;  // rows completed so far
+  const uint8_t *src0 = invert_mask ? pred : ref;  // AVX2 paths: weight mask
+  const uint8_t *src1 = invert_mask ? ref : pred;  // AVX2 paths: max - mask
+  const int stride0 = invert_mask ? width : ref_stride;  // pred stride == width
+  const int stride1 = invert_mask ? ref_stride : width;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {  // 4 rows per pass, 2 rows per 256-bit register
+    do {
+      const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      // comp_pred's stride == width == 16, so each 32-byte store is 2 rows
+      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+      comp_pred += (16 << 2);
+      i += 4;
+    } while (i < height);  // assumes height % 4 == 0 -- TODO confirm
+  } else {  // for width == 32 (every width other than 8 or 16 lands here)
+    do {
+      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
+      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
+      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
+      // second row of the pair
+      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
+      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
+      const __m256i aB =
+          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
+      // output rows are written back to back (comp_pred stride taken as 32)
+      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+      comp_pred += (32 << 1);
+
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);  // assumes an even height -- TODO confirm
+  }
 }
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
index 999b541e3..88e27aef3 100644
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>  // AVX2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
 /* clang-format off */
@@ -35,203 +36,6 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
 };
 /* clang-format on */
 
-void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i, src_2strides, ref_2strides;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing two strides in a 256 bit register reducing the number
-  // of loop stride by half (comparing to the sse2 code)
-  src_2strides = source_stride << 1;
-  ref_2strides = recon_stride << 1;
-  for (i = 0; i < 8; i++) {
-    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
-    src = _mm256_inserti128_si256(
-        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
-
-    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
-    ref = _mm256_inserti128_si256(
-        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += src_2strides;
-    ref_ptr += ref_2strides;
-  }
-
-  {
-    __m128i sum_res, madd_res;
-    __m128i expand_sum_low, expand_sum_high, expand_sum;
-    __m128i expand_madd_low, expand_madd_high, expand_madd;
-    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // extract the low lane and add it to the high lane
-    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
-                            _mm256_extractf128_si256(sum_ref_src, 1));
-
-    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
-                             _mm256_extractf128_si256(madd_ref_src, 1));
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low =
-        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-    expand_sum_high =
-        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low =
-        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-    expand_madd_high =
-        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-
-    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low =
-        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-    ex_expand_sum_high =
-        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-
-    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_res = _mm_srli_si128(expand_madd, 8);
-    sum_res = _mm_srli_si128(ex_expand_sum, 8);
-
-    madd_res = _mm_add_epi32(madd_res, expand_madd);
-    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
-
-    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
-
-    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing 32 elements in parallel
-  for (i = 0; i < 16; i++) {
-    src = _mm256_loadu_si256((__m256i const *)(src_ptr));
-
-    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-
-  {
-    __m256i expand_sum_low, expand_sum_high, expand_sum;
-    __m256i expand_madd_low, expand_madd_high, expand_madd;
-    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
-    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
-    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
-
-    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
-    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
-
-    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
-    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
-
-    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
-    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
-
-    // extract the low lane and the high lane and add the results
-    *((int *)SSE) =
-        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
-
-    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
-                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
-  }
-  _mm256_zeroupper();
-}
-
 #define FILTER_SRC(filter)                               \
   /* filter the source */                                \
   exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 211fad3f8..c8c90a7dc 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -12,24 +12,24 @@
 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
-
-typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   __m128i vsum = _mm_setzero_si128();
   int i;
 
   for (i = 0; i < 32; ++i) {
-    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v = xx_loadu_128(src);
     vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
     src += 8;
   }
@@ -39,276 +39,265 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   return _mm_cvtsi128_si32(vsum);
 }
 
-#define READ64(p, stride, i)                                  \
-  _mm_unpacklo_epi8(                                          \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
 
-static void get4x4var_sse2(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
-  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
-  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
-  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-  // sum
-  __m128i vsum = _mm_add_epi16(diff0, diff1);
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
 
-  // sse
-  vsum =
-      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  *sse = _mm_cvtsi128_si32(vsum);
+// Horizontally add the four 32-bit lanes of val into a single 32-bit value.
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+  return _mm_cvtsi128_si32(val);
 }
 
-void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
-                        int ref_stride, unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Widen the 8 16-bit lanes of sum (sign-extended) and add into 4 32-bit lanes
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+  return _mm_add_epi32(sum_lo, sum_hi);
+}
 
-  for (i = 0; i < 8; i += 2) {
-    const __m128i src0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
-    const __m128i src1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
-    const __m128i ref1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-  }
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i diff = _mm_sub_epi16(src, ref);
+  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+  *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle the diff sum of up to 128 pixels (such as 8x16 or 16x8).
+// Slightly faster than variance_final_256_pel_sse2().
+// The diff sum of 128 pixels still fits in a 16-bit integer.
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  // sum
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
 }
 
-void aom_get16x16var_sse2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  for (i = 0; i < 16; ++i) {
-    const __m128i s = _mm_loadu_si128((const __m128i *)src);
-    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
 
-    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_unpacklo_epi16(vsum, vsum);
+  vsum = _mm_srai_epi32(vsum, 16);
+  *sum = add32x4_sse2(vsum);
+}
 
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+                                                unsigned int *const sse,
+                                                int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-    src += src_stride;
-    ref += ref_stride;
-  }
+  vsum = sum_to_32bit_sse2(vsum);
+  *sum = add32x4_sse2(vsum);
+}
 
-  // sum
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum =
-      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 256);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
 
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
-}
+  for (int i = 0; i < h; i += 2) {
+    const __m128i s = load4x2_sse2(src, src_stride);
+    const __m128i r = load4x2_sse2(ref, ref_stride);
 
-static void variance_sse2(const unsigned char *src, int src_stride,
-                          const unsigned char *ref, int ref_stride, int w,
-                          int h, unsigned int *sse, int *sum,
-                          getNxMvar_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
+    variance_kernel_sse2(s, r, sse, sum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
   }
 }
 
-unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 4 * 4);
-  assert(sum >= -255 * 4 * 4);
-  return *sse - ((sum * sum) >> 4);
-}
-
-unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 8 * 4);
-  assert(sum >= -255 * 8 * 4);
-  return *sse - ((sum * sum) >> 5);
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 128);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+  for (int i = 0; i < h; i++) {
+    const __m128i s = load8_8to16_sse2(src);
+    const __m128i r = load8_8to16_sse2(ref);
+
+    variance_kernel_sse2(s, r, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 8 * 4);
-  assert(sum >= -255 * 8 * 4);
-  return *sse - ((sum * sum) >> 5);
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m128i *const sse,
+                                          __m128i *const sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i s = _mm_loadu_si128((const __m128i *)src);
+  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+  variance_kernel_sse2(src0, ref0, sse, sum);
+  variance_kernel_sse2(src1, ref1, sse, sum);
 }
 
-unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 8 * 8);
-  assert(sum >= -255 * 8 * 8);
-  return *sse - ((sum * sum) >> 6);
-}
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 64);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
 
-unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 16 * 8);
-  assert(sum >= -255 * 16 * 8);
-  return *sse - ((sum * sum) >> 7);
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src, ref, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 16 * 8);
-  assert(sum >= -255 * 16 * 8);
-  return *sse - ((sum * sum) >> 7);
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 32);  // May overflow for larger height.
+  // Don't initialize sse here since it's an accumulation.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
-                                    const unsigned char *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 16 * 16);
-  assert(sum >= -255 * 16 * 16);
-  return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 16);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
+    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 32);
-  assert(sum >= -255 * 32 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m128i *const sse,
+                                    __m128i *const sum) {
+  assert(h <= 8);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      const int offset0 = j << 5;
+      const int offset1 = offset0 + 16;
+      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
+      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m128i vsse = _mm_setzero_si128();                                       \
+    __m128i vsum;                                                             \
+    int sum = 0;                                                              \
+    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
+    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum);            \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
+
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
+
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m128i vsse = _mm_setzero_si128();                                       \
+    __m128i vsum = _mm_setzero_si128();                                       \
+    for (int i = 0; i < (bh / uh); ++i) {                                     \
+      __m128i vsum16;                                                         \
+      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse,        \
+                          &vsum16);                                           \
+      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));                  \
+      src += (src_stride * uh);                                               \
+      ref += (ref_stride * uh);                                               \
+    }                                                                         \
+    *sse = add32x4_sse2(vsse);                                                \
+    int sum = add32x4_sse2(vsum);                                             \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 64);
-  assert(sum >= -255 * 64 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
-}
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
 
-unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
 
-unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
 
 unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
@@ -338,74 +327,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
   return *sse;
 }
 
-#if CONFIG_EXT_PARTITION_TYPES
-unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 4 * 16);
-  assert(sum >= -255 * 4 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
-}
-
-unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 16 * 4);
-  assert(sum >= -255 * 16 * 4);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
-}
-
-unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 8 * 32);
-  assert(sum >= -255 * 8 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
-}
-
-unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 32 * 8);
-  assert(sum >= -255 * 32 * 8);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
-}
-
-unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 16 * 64);
-  assert(sum >= -255 * 16 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 16);
-  assert(sum >= -255 * 64 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-#endif
-
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt)                                                           \
@@ -423,75 +344,57 @@ DECLS(ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
-    unsigned int sse;                                                          \
-    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
-                                                  y_offset, dst, dst_stride,   \
-                                                  h, &sse, NULL, NULL);        \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_sub_pixel_variance##wf##xh_##opt(                          \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
+  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Cap rows per helper call at 64 to avoid overflow in the helper.*/       \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) { /*step wf columns per pass*/         \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) { /*hf rows per helper call*/        \
+        unsigned int sse2; /*filled in by helper via &sse2*/                  \
+        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2, NULL, NULL);                                               \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse; /*summed SSE of all helper calls*/                        \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));   \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t));    \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t));  \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
-#endif
+#define FNS(opt) /*FN(w,h,wf,wlog2,hlog2,opt,...)*/  \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -516,76 +419,61 @@ DECLS(ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                    \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
-      const uint8_t *sec) {                                                    \
-    unsigned int sse;                                                          \
-    int se = aom_sub_pixel_avg_variance##wf##xh_##opt(                         \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                      \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sseptr = sse;                                                             \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
+      const uint8_t *sec) {                                                  \
+    /*Cap rows per helper call at 64 to avoid overflow in the helper.*/      \
+    const int hf = AOMMIN(h, 64);                                            \
+    unsigned int sse = 0;                                                    \
+    int se = 0;                                                              \
+    for (int i = 0; i < (w / wf); ++i) { /*step wf columns per pass*/        \
+      const uint8_t *src_ptr = src;                                          \
+      const uint8_t *dst_ptr = dst;                                          \
+      const uint8_t *sec_ptr = sec;                                          \
+      for (int j = 0; j < (h / hf); ++j) { /*hf rows per helper call*/       \
+        unsigned int sse2;                                                   \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
+            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
+        dst_ptr += hf * dst_stride;                                          \
+        src_ptr += hf * src_stride;                                          \
+        sec_ptr += hf * w; /*sec uses w as its row stride*/                  \
+        se += se2;                                                           \
+        sse += sse2;                                                         \
+      }                                                                      \
+      src += wf;                                                             \
+      dst += wf;                                                             \
+      sec += wf;                                                             \
+    }                                                                        \
+    *sse_ptr = sse; /*summed SSE of all helper calls*/                       \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));   \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t));    \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t));  \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
-#endif
+#define FNS(opt) /*FN(w,h,wf,wlog2,hlog2,opt,...)*/  \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -593,9 +481,97 @@ FNS(ssse3);
 #undef FNS
 #undef FN
 
-void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
                              int subpel_x_q3, int subpel_y_q3,
                              const uint8_t *ref, int ref_stride) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
       int i;
@@ -604,8 +580,7 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       for (i = 0; i < height; i++) {
         int j;
         for (j = 0; j < width; j += 16) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
           comp_pred += 16;
           ref += 16;
         }
@@ -617,10 +592,9 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       assert(!(height & 1));
       /*Read 8 pixels two rows at a time.*/
       for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
         comp_pred += 16;
         ref += 2 * ref_stride;
       }
@@ -630,69 +604,62 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       assert(!(height & 3));
       /*Read 4 pixels four rows at a time.*/
       for (i = 0; i < height; i++) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride));
-        __m128i s2 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride));
-        __m128i s3 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride));
-        __m128i t0 = _mm_unpacklo_epi32(s0, s1);
-        __m128i t1 = _mm_unpacklo_epi32(s2, s3);
-        __m128i u0 = _mm_unpacklo_epi64(t0, t1);
-        _mm_storeu_si128((__m128i *)comp_pred, u0);
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
         comp_pred += 16;
         ref += 4 * ref_stride;
       }
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t,
-                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-      aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-    }
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
+                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                        intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                       width, height);
   }
 }
 
-void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
-                                      int width, int height, int subpel_x_q3,
-                                      int subpel_y_q3, const uint8_t *ref,
-                                      int ref_stride) {
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride) {
   int n;
   int i;
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
   for (i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0));
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
     comp_pred += 16;
     pred += 16;
   }
author	trav90 <travawine@palemoon.org>	2018-10-19 21:52:15 -0500
committer	trav90 <travawine@palemoon.org>	2018-10-19 21:52:20 -0500
commit	bbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree	437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/aom_dsp
parent	14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
download	UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip