99 files changed, 9493 insertions, 26260 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index 4067b0b53..401fbdc48 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -9,8 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
 
 #if HAVE_SSE2
@@ -20,12 +21,6 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2;
 filter8_1dfunction aom_filter_block1d8_h8_sse2;
 filter8_1dfunction aom_filter_block1d4_v8_sse2;
 filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_avg_sse2;
 
 filter8_1dfunction aom_filter_block1d16_v2_sse2;
 filter8_1dfunction aom_filter_block1d16_h2_sse2;
@@ -33,12 +28,6 @@ filter8_1dfunction aom_filter_block1d8_v2_sse2;
 filter8_1dfunction aom_filter_block1d8_h2_sse2;
 filter8_1dfunction aom_filter_block1d4_v2_sse2;
 filter8_1dfunction aom_filter_block1d4_h2_sse2;
-filter8_1dfunction aom_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
 
 // void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
@@ -50,47 +39,16 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_sse2;
 //                              const int16_t *filter_x, int x_step_q4,
 //                              const int16_t *filter_y, int y_step_q4,
 //                              int w, int h);
-// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
-//                                  int w, int h);
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                         uint8_t *dst, ptrdiff_t dst_stride,
-//                         const int16_t *filter_x, int x_step_q4,
-//                         const int16_t *filter_y, int y_step_q4,
-//                         int w, int h);
-// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                             uint8_t *dst, ptrdiff_t dst_stride,
-//                             const int16_t *filter_x, int x_step_q4,
-//                             const int16_t *filter_y, int y_step_q4,
-//                             int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
 
-#if CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#if ARCH_X86_64
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2;
 
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
@@ -98,12 +56,6 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
 
 // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
 //                                      ptrdiff_t src_stride,
@@ -123,60 +75,8 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2;
 //                                     const int16_t *filter_y,
 //                                     int y_step_q4,
 //                                     int w, int h, int bd);
-// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
-//                                          ptrdiff_t src_stride,
-//                                          uint8_t *dst,
-//                                          ptrdiff_t dst_stride,
-//                                          const int16_t *filter_x,
-//                                          int x_step_q4,
-//                                          const int16_t *filter_y,
-//                                          int y_step_q4,
-//                                          int w, int h, int bd);
-// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
-//                                         ptrdiff_t src_stride,
-//                                         uint8_t *dst,
-//                                         ptrdiff_t dst_stride,
-//                                         const int16_t *filter_x,
-//                                         int x_step_q4,
-//                                         const int16_t *filter_y,
-//                                         int y_step_q4,
-//                                         int w, int h, int bd);
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 sse2);
-
-// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h, int bd);
-// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
 
-#if CONFIG_LOOP_RESTORATION
-// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
-// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
-void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h, int bd) {
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  ((int16_t *)filter_x)[3] += 128;
-  ((int16_t *)filter_y)[3] += 128;
-  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h, bd);
-  ((int16_t *)filter_x)[3] -= 128;
-  ((int16_t *)filter_y)[3] -= 128;
-}
-#endif  // CONFIG_LOOP_RESTORATION
-#endif  // CONFIG_HIGHBITDEPTH && ARCH_X86_64
+#endif  // ARCH_X86_64
 #endif  // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
index 4d3142867..7283c32b8 100644
--- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -50,7 +50,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   cmp r4d, 32
   je .w32
 
-%if CONFIG_AV1 && CONFIG_EXT_PARTITION
   cmp r4d, 64
   je .w64
 %ifidn %2, highbd
@@ -160,50 +159,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   jnz .loop128
   RET
 
-%else  ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-
-%ifidn %2, highbd
-  cmp r4d, 64
-  je .w64
-
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-%endif
-%endif  ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-
 .w64:
   mov                    r4d, dword hm
 .loop64:
@@ -339,7 +294,4 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
 INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
-%if CONFIG_HIGHBITDEPTH
 convolve_fn copy, highbd
-convolve_fn avg, highbd
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c
deleted file mode 100644
index 14352895d..000000000
--- a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h) {
-  const int bd = 8;
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
-  int intermediate_height = h + SUBPEL_TAPS - 1;
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
-        // Filter even-index pixels
-        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero),
-                            _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
-        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storel_epi64(p, res_8bit);
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
index e6d357ba3..b6f040791 100644
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -200,6 +200,8 @@
     movdqu      [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void aom_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -392,169 +394,6 @@ sym(aom_highbd_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movq        xmm0, [rsi]                 ;load src: row 0
-    movq        xmm1, [rsi + rax]           ;1
-    movq        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2]       ;7
-    movq        xmm2, [rsi + rax]           ;2
-    movq        xmm3, [rsi + rax * 2]       ;3
-    movq        xmm4, [rsi + rdx]           ;4
-    movq        xmm5, [rsi + rax * 4]       ;5
-
-    HIGH_APPLY_FILTER_4 1
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 1, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 1, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 16
-    HIGH_APPLY_FILTER_8 1, 16
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void aom_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
@@ -772,194 +611,3 @@ sym(aom_highbd_filter_block1d16_h8_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm4,   [rsi + 2]
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm4
-    movdqa      xmm7, xmm4
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm4
-
-    psrldq      xmm1, 2
-    psrldq      xmm6, 4
-    psrldq      xmm7, 6
-    psrldq      xmm2, 4
-    psrldq      xmm3, 6
-    psrldq      xmm5, 2
-
-    HIGH_APPLY_FILTER_4 1
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 1, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 1, 0
-
-    movdqu      xmm0,   [rsi + 10]           ;load src
-    movdqu      xmm1,   [rsi + 12]
-    movdqu      xmm2,   [rsi + 14]
-    movdqu      xmm3,   [rsi + 16]
-    movdqu      xmm4,   [rsi + 18]
-    movdqu      xmm5,   [rsi + 20]
-    movdqu      xmm6,   [rsi + 22]
-    movdqu      xmm7,   [rsi + 24]
-
-    HIGH_APPLY_FILTER_8 1, 16
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
index 9e2ec748c..7b3fe6419 100644
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -174,6 +174,8 @@
 %endm
 %endif
 
+SECTION .text
+
 global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_v2_sse2):
     push        rbp
@@ -254,86 +256,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2):
     ret
 %endif
 
-global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movq        xmm0, [rsi]                 ;load src
-    movq        xmm1, [rsi + 2*rax]
-
-    HIGH_APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + 2*rax]         ;1
-
-    HIGH_APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + 2*rax]       ;1
-    movdqu        xmm2, [rsi + 16]
-    movdqu        xmm3, [rsi + 2*rax + 16]
-
-    HIGH_APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
-
 global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_highbd_filter_block1d4_h2_sse2):
     push        rbp
@@ -414,84 +336,3 @@ sym(aom_highbd_filter_block1d16_h2_sse2):
     pop         rbp
     ret
 %endif
-
-global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 2
-
-    HIGH_APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 2]
-
-    HIGH_APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 2]
-    movdqu      xmm2,   [rsi + 16]
-    movdqu      xmm3,   [rsi + 18]
-
-    HIGH_APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
deleted file mode 100644
index 74ce80e50..000000000
--- a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-void aom_highbd_convolve8_add_src_hip_ssse3(
-    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
-
-  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
-  int intermediate_height = h + SUBPEL_TAPS - 1;
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i data2 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
-        const __m128i res_2 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
-        const __m128i res_4 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
-        const __m128i res_6 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
-
-        // Filter odd-index pixels
-        const __m128i res_1 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
-        const __m128i res_3 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
-        const __m128i res_5 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
-        const __m128i res_7 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1);
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
-        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
-        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storeu_si128(p, res_16bit);
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 61476b8be..af45a03ac 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -11,31 +11,12 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
 #include "aom_ports/mem.h"
 
-// filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
 #if defined(__clang__)
 #if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
     (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
@@ -566,10 +547,4 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3;
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
 
-// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-FUN_CONV_2D(, avx2);
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index be37738df..6bcb4a512 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -11,7 +11,8 @@
 
 #include <tmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve.h"
 #include "aom_mem/aom_mem.h"
@@ -285,20 +286,6 @@ filter8_1dfunction aom_filter_block1d8_v8_ssse3;
 filter8_1dfunction aom_filter_block1d8_h8_ssse3;
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
-#if CONFIG_LOOP_RESTORATION
-filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
-#endif
 
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -306,12 +293,6 @@ filter8_1dfunction aom_filter_block1d8_v2_ssse3;
 filter8_1dfunction aom_filter_block1d8_h2_ssse3;
 filter8_1dfunction aom_filter_block1d4_v2_ssse3;
 filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
 
 // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -323,598 +304,5 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
 //                               const int16_t *filter_x, int x_step_q4,
 //                               const int16_t *filter_y, int y_step_q4,
 //                               int w, int h);
-// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
-// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
-
-#if CONFIG_LOOP_RESTORATION
-FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
-                        ssse3);
-FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
-                        src - src_stride * 3, add_src_, ssse3);
-#endif
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
-    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
-    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
-    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
-                                                                          \
-    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
-    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
-    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
-    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
-    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
-    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
-    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
-    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
-    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
-  }
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *x_filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
-  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
-  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
-  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
-  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
-  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
-  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
-  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
-  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A, B, C, D, E, F, G, H;
-
-  A = _mm_loadl_epi64((const __m128i *)src);
-  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
-  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
-  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
-  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
-  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
-  _mm_storel_epi64((__m128i *)dst, A);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
-}
-
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  // This function processes 8x8 areas.  The intermediate height is not always
-  // a multiple of 8, so force it to be a multiple of 8 here.
-  y = h + (8 - (h & 0x7));
-
-  do {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 8) {
-      // process 8 src_x steps
-      for (z = 0; z < 8; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 8; ++i) {
-            temp[z * 8 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 8x8 filters values back to dst
-      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
-    }
-
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-  } while (y -= 8);
-}
-
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  // TRANSPOSE...
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  //
-  // TO
-  //
-  // 00 10 20 30
-  // 01 11 21 31
-  // 02 12 22 32
-  // 03 13 23 33
-  // 04 14 24 34
-  // 05 15 25 35
-  // 06 16 26 36
-  // 07 17 27 37
-  //
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 02 03 12 13 22 23 32 33
-  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
-  // 06 07 16 17 26 27 36 37
-  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
-  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
-  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
-  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
-  // 00 10 01 11 02 12 03 13
-  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
-  // 20 30 21 31 22 32 23 33
-  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
-  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  B = _mm_srli_si128(A, 4);
-  C = _mm_srli_si128(A, 8);
-  D = _mm_srli_si128(A, 12);
-
-  *(int *)(dst) = _mm_cvtsi128_si32(A);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
-}
-
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  for (y = 0; y < h; y += 4) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 4) {
-      // process 4 src_x steps
-      for (z = 0; z < 4; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 4; ++i) {
-            temp[z * 4 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 4x4 filters values back to dst
-      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
-    }
-
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-  }
-}
-
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
-  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
-  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter, int w) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  int i;
-
-  for (i = 0; i < w; i += 16) {
-    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
-    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-    const __m128i C =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-    const __m128i D =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-    const __m128i E =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-    const __m128i F =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    const __m128i G =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    const __m128i H =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-    // merge the result together
-    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
-    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
-    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
-    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
-    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
-    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
-    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
-    // add and saturate the results together
-    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
-    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
-    // merge the result together
-    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
-    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
-    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
-    // merge the result together
-    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
-    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
-    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
-    // add and saturate the results together
-    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
-    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
-    // add and saturate the results together
-    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
-    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
-    // round and shift by 7 bit each 16 bit
-    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
-    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
-    src_ptr += 16;
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i *)&dst[i], temp_hi);
-  }
-}
-
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *y_filters, int y0_q4,
-                                    int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
-                            w);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  if (w >= 8) {
-    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  } else {
-    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  }
-
-  if (w >= 16) {
-    scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-  } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  } else {
-    scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
-#if CONFIG_LOOP_RESTORATION
-FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
-#endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
index b946010d3..c88fc9ffb 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -179,6 +179,8 @@
     movq        [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void aom_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -368,166 +370,6 @@ sym(aom_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movd        xmm0, [rsi]                 ;load src: row 0
-    movd        xmm1, [rsi + rax]           ;1
-    movd        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movd        xmm7, [rsi + rdx * 2]       ;7
-    movd        xmm2, [rsi + rax]           ;2
-    movd        xmm3, [rsi + rax * 2]       ;3
-    movd        xmm4, [rsi + rdx]           ;4
-    movd        xmm5, [rsi + rax * 4]       ;5
-
-    APPLY_FILTER_4 1
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 1, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_v8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 1, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 8
-    APPLY_FILTER_8 1, 8
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void aom_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
@@ -771,220 +613,3 @@ sym(aom_filter_block1d16_h8_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm3, 3
-    psrldq      xmm5, 5
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_4 1
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_h8_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 0
-
-    movdqu      xmm0,   [rsi + 5]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 1, 8
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 8688fb544..3ca7921b6 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -375,17 +375,8 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
 
 INIT_XMM ssse3
 SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER16 h8_avg
 SUBPIX_HFILTER8  h8
-SUBPIX_HFILTER8  h8_avg
 SUBPIX_HFILTER4  h8
-SUBPIX_HFILTER4  h8_avg
-
-%if CONFIG_LOOP_RESTORATION
-SUBPIX_HFILTER16 h8_add_src
-SUBPIX_HFILTER8  h8_add_src
-SUBPIX_HFILTER4  h8_add_src
-%endif
 
 ;-------------------------------------------------------------------------------
 
@@ -875,15 +866,5 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
 
 INIT_XMM ssse3
 SUBPIX_VFILTER16     v8
-SUBPIX_VFILTER16 v8_avg
 SUBPIX_VFILTER       v8, 8
-SUBPIX_VFILTER   v8_avg, 8
 SUBPIX_VFILTER       v8, 4
-SUBPIX_VFILTER   v8_avg, 4
-
-%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
-    CONFIG_LOOP_RESTORATION
-SUBPIX_VFILTER16 v8_add_src
-SUBPIX_VFILTER   v8_add_src, 8
-SUBPIX_VFILTER   v8_add_src, 4
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
index 8f025a8be..d0b4b2839 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -134,6 +134,8 @@
     dec         rcx
 %endm
 
+SECTION .text
+
 global sym(aom_filter_block1d4_v2_sse2) PRIVATE
 sym(aom_filter_block1d4_v2_sse2):
     push        rbp
@@ -212,84 +214,6 @@ sym(aom_filter_block1d16_v2_sse2):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_v2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-    movdqa        xmm3, xmm1
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 global sym(aom_filter_block1d4_h2_sse2) PRIVATE
 sym(aom_filter_block1d4_h2_sse2):
     push        rbp
@@ -369,83 +293,3 @@ sym(aom_filter_block1d16_h2_sse2):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d4_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d8_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(aom_filter_block1d16_h2_avg_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
index b9b2da0be..59edc49a9 100644
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -108,6 +108,8 @@
     dec         rcx
 %endm
 
+SECTION .text
+
 global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
 sym(aom_filter_block1d4_v2_ssse3):
     push        rbp
@@ -185,83 +187,6 @@ sym(aom_filter_block1d16_v2_ssse3):
     pop         rbp
     ret
 
-global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d4_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d8_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d16_v2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
 sym(aom_filter_block1d4_h2_ssse3):
     push        rbp
@@ -340,82 +265,3 @@ sym(aom_filter_block1d16_h2_ssse3):
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d4_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d8_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(aom_filter_block1d16_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
deleted file mode 100644
index 1a6457402..000000000
--- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_ports/mem.h"
-
-void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0 = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr =
-        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
-    coeff += 8;
-  }
-}
-
-int aom_satd_sse2(const int16_t *coeff, int length) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i accum = zero;
-
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
-  }
-
-  {  // cascading summation of accum
-    __m128i hi = _mm_srli_si128(accum, 8);
-    accum = _mm_add_epi32(accum, hi);
-    hi = _mm_srli_epi64(accum, 32);
-    accum = _mm_add_epi32(accum, hi);
-  }
-
-  return _mm_cvtsi128_si32(accum);
-}
-
-void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride,
-                          int height) {
-  int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-  }
-
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-
-  if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
-  }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
-
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, s1);
-  }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
-}
-
-int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) {
-  int idx;
-  int width = 4 << bwl;
-  int16_t mean;
-  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
-  __m128i v1 = _mm_load_si128((const __m128i *)src);
-  __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sum = diff;
-  __m128i sse = _mm_madd_epi16(diff, diff);
-
-  ref += 8;
-  src += 8;
-
-  for (idx = 8; idx < width; idx += 8) {
-    v0 = _mm_loadu_si128((const __m128i *)ref);
-    v1 = _mm_load_si128((const __m128i *)src);
-    diff = _mm_subs_epi16(v0, v1);
-
-    sum = _mm_add_epi16(sum, diff);
-    v0 = _mm_madd_epi16(diff, diff);
-    sse = _mm_add_epi32(sse, v0);
-
-    ref += 8;
-    src += 8;
-  }
-
-  v0 = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, v0);
-  v0 = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, v0);
-  v0 = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, v0);
-
-  v1 = _mm_srli_si128(sse, 8);
-  sse = _mm_add_epi32(sse, v1);
-  v1 = _mm_srli_epi64(sse, 32);
-  sse = _mm_add_epi32(sse, v1);
-
-  mean = _mm_extract_epi16(sum, 0);
-
-  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
deleted file mode 100644
index b2d150296..000000000
--- a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm
+++ /dev/null
@@ -1,124 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%define private_prefix aom
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the hadamard transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro HMD8_1D 0
-  psubw              m8, m0, m1
-  psubw              m9, m2, m3
-  paddw              m0, m1
-  paddw              m2, m3
-  SWAP               1, 8
-  SWAP               3, 9
-  psubw              m8, m4, m5
-  psubw              m9, m6, m7
-  paddw              m4, m5
-  paddw              m6, m7
-  SWAP               5, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m2
-  psubw              m9, m1, m3
-  paddw              m0, m2
-  paddw              m1, m3
-  SWAP               2, 8
-  SWAP               3, 9
-  psubw              m8, m4, m6
-  psubw              m9, m5, m7
-  paddw              m4, m6
-  paddw              m5, m7
-  SWAP               6, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m4
-  psubw              m9, m1, m5
-  paddw              m0, m4
-  paddw              m1, m5
-  SWAP               4, 8
-  SWAP               5, 9
-  psubw              m8, m2, m6
-  psubw              m9, m3, m7
-  paddw              m2, m6
-  paddw              m3, m7
-  SWAP               6, 8
-  SWAP               7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-  HMD8_1D
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
index e916e4ff9..4f5e3f8c1 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -11,7 +11,7 @@
 
 #include "aom/aom_integer.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 // To start out, just dispatch to the function using the 2D mask and
 // pass mask stride as 0. This can be improved upon if necessary.
@@ -19,18 +19,16 @@
 void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, 0, h, w, 0, 0);
+                            src1_stride, mask, 0, w, h, 0, 0);
 }
 
-#if CONFIG_HIGHBITDEPTH
 void aom_highbd_blend_a64_hmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
-                                   src1_8, src1_stride, mask, 0, h, w, 0, 0,
+                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
                                    bd);
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
index 68d74e517..49c20b467 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
@@ -31,7 +31,7 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
-                                     int h, int w) {
+                                     int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -58,7 +58,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, uint32_t mask_stride,
-                                     int h, int w) {
+                                     int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -84,7 +84,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
 static void blend_a64_mask_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -119,7 +119,7 @@ static void blend_a64_mask_w16n_sse4_1(
 static void blend_a64_mask_sx_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -149,7 +149,7 @@ static void blend_a64_mask_sx_w4_sse4_1(
 static void blend_a64_mask_sx_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -179,7 +179,7 @@ static void blend_a64_mask_sx_w8_sse4_1(
 static void blend_a64_mask_sx_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -219,7 +219,7 @@ static void blend_a64_mask_sx_w16n_sse4_1(
 static void blend_a64_mask_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -248,7 +248,7 @@ static void blend_a64_mask_sy_w4_sse4_1(
 static void blend_a64_mask_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -277,7 +277,7 @@ static void blend_a64_mask_sy_w8_sse4_1(
 static void blend_a64_mask_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zero = _mm_setzero_si128();
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -315,7 +315,7 @@ static void blend_a64_mask_sy_w16n_sse4_1(
 static void blend_a64_mask_sx_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -350,7 +350,7 @@ static void blend_a64_mask_sx_sy_w4_sse4_1(
 static void blend_a64_mask_sx_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -385,7 +385,7 @@ static void blend_a64_mask_sx_sy_w8_sse4_1(
 static void blend_a64_mask_sx_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
     uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
@@ -435,12 +435,12 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1(
 void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
-                               const uint8_t *mask, uint32_t mask_stride, int h,
-                               int w, int suby, int subx) {
+                               const uint8_t *mask, uint32_t mask_stride, int w,
+                               int h, int subx, int suby) {
   typedef void (*blend_fn)(
       uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
       uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int h, int w);
+      const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
   // Dimensions are: width_index X subx X suby
   static const blend_fn blend[3][2][2] = {
@@ -465,15 +465,14 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, h, w, suby, subx);
+                         mask, mask_stride, w, h, subx, suby);
   } else {
     blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
                                               src0_stride, src1, src1_stride,
-                                              mask, mask_stride, h, w);
+                                              mask, mask_stride, w, h);
   }
 }
 
-#if CONFIG_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -503,7 +502,7 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1(
 static void blend_a64_mask_b10_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, blend_4_b10);
@@ -512,7 +511,7 @@ static void blend_a64_mask_b10_w4_sse4_1(
 static void blend_a64_mask_b12_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, blend_4_b12);
@@ -521,7 +520,7 @@ static void blend_a64_mask_b12_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -546,18 +545,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1(
 static void blend_a64_mask_b10_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
+                               src1_stride, mask, mask_stride, w, h,
                                blend_8_b10);
 }
 
 static void blend_a64_mask_b12_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
+                               src1_stride, mask, mask_stride, w, h,
                                blend_8_b12);
 }
 
@@ -594,7 +593,7 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
 static void blend_a64_mask_b10_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -604,7 +603,7 @@ static void blend_a64_mask_b10_sx_w4_sse4_1(
 static void blend_a64_mask_b12_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -614,7 +613,7 @@ static void blend_a64_mask_b12_sx_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
@@ -643,18 +642,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
 static void blend_a64_mask_b10_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b12);
 }
 
@@ -690,7 +689,7 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
 static void blend_a64_mask_b10_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -700,7 +699,7 @@ static void blend_a64_mask_b10_sy_w4_sse4_1(
 static void blend_a64_mask_b12_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h,
@@ -710,7 +709,7 @@ static void blend_a64_mask_b12_sy_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
@@ -738,18 +737,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
 static void blend_a64_mask_b10_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
+                                  src1_stride, mask, mask_stride, w, h,
                                   blend_8_b12);
 }
 
@@ -791,7 +790,7 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
 static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h,
@@ -801,7 +800,7 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
 static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   (void)w;
   blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h,
@@ -811,7 +810,7 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
 static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h,
     blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                          0xff, 0, 0xff, 0, 0xff, 0, 0xff);
@@ -845,18 +844,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
 static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, h, w,
+                                     src1_stride, mask, mask_stride, w, h,
                                      blend_8_b10);
 }
 
 static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
+    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
   blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, h, w,
+                                     src1_stride, mask, mask_stride, w, h,
                                      blend_8_b12);
 }
 
@@ -869,12 +868,12 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                       uint32_t src0_stride,
                                       const uint8_t *src1_8,
                                       uint32_t src1_stride, const uint8_t *mask,
-                                      uint32_t mask_stride, int h, int w,
-                                      int suby, int subx, int bd) {
+                                      uint32_t mask_stride, int w, int h,
+                                      int subx, int suby, int bd) {
   typedef void (*blend_fn)(
       uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
       uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int h, int w);
+      const uint8_t *mask, uint32_t mask_stride, int w, int h);
 
   // Dimensions are: bd_index X width_index X subx X suby
   static const blend_fn blend[2][2][2][2] = {
@@ -909,8 +908,8 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
   assert(bd == 8 || bd == 10 || bd == 12);
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                src1_stride, mask, mask_stride, h, w, suby,
-                                subx, bd);
+                                src1_stride, mask, mask_stride, w, h, subx,
+                                suby, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
@@ -918,7 +917,113 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
 
     blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
         dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-        mask_stride, h, w);
+        mask_stride, w, h);
+  }
+}
+
+static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0,
+                                      const CONV_BUF_TYPE *src1,
+                                      const __m128i *m,
+                                      const __m128i *v_round_offset,
+                                      const __m128i *v_maxval, int round_bits) {
+  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+  const __m128i s0 = xx_loadl_64(src0);
+  const __m128i s1 = xx_loadl_64(src1);
+  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+  const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS);
+  const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset);
+  const __m128i res_d = xx_roundn_epi32(res_c, round_bits);
+  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+  const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+  xx_storel_32(dst, res);
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+    ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i v_ro_a = xx_loadl_32(&round_offset);
+  const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0);
+  const __m128i one_w = _mm_set1_epi16(1);
+  const __m128i one_b = _mm_set1_epi8(1);
+  const __m128i two_w = _mm_set1_epi16(2);
+
+  if (subw == 0 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]);
+        const __m128i m = _mm_cvtepu8_epi16(m0);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 =
+            xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]);
+        const __m128i m_i1 =
+            xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+        const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b);
+        const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd);
+        const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+        const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
+        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
+  } else {
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 4) {
+        const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]);
+        const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]);
+        const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1);
+        const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b);
+        const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w);
+        const __m128i m = _mm_srli_epi16(m_ac_1, 1);
+
+        blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j],
+                           &src1[i * src1_stride + j], &m, &v_round_offset,
+                           &v_maxval, round_bits);
+      }
+    }
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
index 9dabe5b79..59506bdfe 100644
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -21,7 +21,7 @@
 #include "aom_dsp/x86/synonyms.h"
 #include "aom_dsp/x86/blend_sse4.h"
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
@@ -30,7 +30,7 @@
 static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                       const uint8_t *src0, uint32_t src0_stride,
                                       const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int h, int w) {
+                                      const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -55,7 +55,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
 static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                       const uint8_t *src0, uint32_t src0_stride,
                                       const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int h, int w) {
+                                      const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   (void)w;
@@ -82,7 +82,7 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                         uint32_t src0_stride,
                                         const uint8_t *src1,
                                         uint32_t src1_stride,
-                                        const uint8_t *mask, int h, int w) {
+                                        const uint8_t *mask, int w, int h) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -112,11 +112,11 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
 void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                 const uint8_t *src0, uint32_t src0_stride,
                                 const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int h, int w) {
+                                const uint8_t *mask, int w, int h) {
   typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
                            const uint8_t *src0, uint32_t src0_stride,
                            const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w);
+                           const uint8_t *mask, int w, int h);
 
   // Dimension: width_index
   static const blend_fn blend[9] = {
@@ -139,11 +139,10 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h,
-                 w);
+  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
+                 h);
 }
 
-#if CONFIG_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
 // Implementation - No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
@@ -174,7 +173,7 @@ static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
-                                          const uint8_t *mask, int h, int w) {
+                                          const uint8_t *mask, int w, int h) {
   (void)w;
   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, blend_4_b10);
@@ -185,7 +184,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
-                                          const uint8_t *mask, int h, int w) {
+                                          const uint8_t *mask, int w, int h) {
   (void)w;
   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, blend_4_b12);
@@ -194,7 +193,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
 static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
 
   do {
@@ -218,9 +217,9 @@ static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                            uint32_t src0_stride,
                                            const uint16_t *src1,
                                            uint32_t src1_stride,
-                                           const uint8_t *mask, int h, int w) {
+                                           const uint8_t *mask, int w, int h) {
   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, h, w, blend_8_b10);
+                                src1_stride, mask, w, h, blend_8_b10);
 }
 
 static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
@@ -228,9 +227,9 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                            uint32_t src0_stride,
                                            const uint16_t *src1,
                                            uint32_t src1_stride,
-                                           const uint8_t *mask, int h, int w) {
+                                           const uint8_t *mask, int w, int h) {
   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, h, w, blend_8_b12);
+                                src1_stride, mask, w, h, blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -240,11 +239,11 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
 void aom_highbd_blend_a64_vmask_sse4_1(
     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int h, int w, int bd) {
+    const uint8_t *mask, int w, int h, int bd) {
   typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
                            const uint16_t *src0, uint32_t src0_stride,
                            const uint16_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int h, int w);
+                           const uint8_t *mask, int w, int h);
 
   // Dimensions are: bd_index X width_index
   static const blend_fn blend[2][2] = {
@@ -272,14 +271,13 @@ void aom_highbd_blend_a64_vmask_sse4_1(
 
   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
     aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                 src1_stride, mask, h, w, bd);
+                                 src1_stride, mask, w, h, bd);
   } else {
     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
     const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
 
     blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, h, w);
+                                  src1_stride, mask, w, h);
   }
 }
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
index daa2b2b3a..4880438bc 100644
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -53,7 +53,6 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
   return v_res_w;
 }
 
-#if CONFIG_HIGHBITDEPTH
 typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w);
 
@@ -141,6 +140,5 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
 
   return v_res_w;
 }
-#endif  // CONFIG_HIGHBITDEPTH
 
 #endif  // AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
index 5f9596a74..3f46420dd 100644
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 // Note: in and out could have the same value
 static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
index 8641164db..36fb1963a 100644
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -13,7 +13,8 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #include "aom_dsp/aom_convolve.h"
@@ -84,102 +85,6 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     }                                                                        \
   }
 
-#define FUN_CONV_2D(avg, opt)                                                \
-  void aom_convolve8_##avg##opt(                                             \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                   \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                   \
-    assert(w <= MAX_SB_SIZE);                                                \
-    assert(h <= MAX_SB_SIZE);                                                \
-    assert(x_step_q4 == 16);                                                 \
-    assert(y_step_q4 == 16);                                                 \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] ||          \
-        filter_y[1] || filter_y[2]) {                                        \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
-      aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2,    \
-                                MAX_SB_SIZE, filter_x, x_step_q4, filter_y,  \
-                                y_step_q4, w, h + 7);                        \
-      aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                      dst, dst_stride, filter_x, x_step_q4,  \
-                                      filter_y, y_step_q4, w, h);            \
-    } else {                                                                 \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
-      aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE,        \
-                                filter_x, x_step_q4, filter_y, y_step_q4, w, \
-                                h + 1);                                      \
-      aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride,  \
-                                      filter_x, x_step_q4, filter_y,         \
-                                      y_step_q4, w, h);                      \
-    }                                                                        \
-  }
-
-#if CONFIG_LOOP_RESTORATION
-// convolve_add_src is only used by the Wiener filter, which will never
-// end up calling the bilinear functions (it uses a symmetric filter, so
-// the possible numbers of taps are 1,3,5,7)
-#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
-                                opt)                                        \
-  void aom_convolve8_##name##_##opt(                                        \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    (void)filter_x;                                                         \
-    (void)x_step_q4;                                                        \
-    (void)filter_y;                                                         \
-    (void)y_step_q4;                                                        \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                      \
-    assert(step_q4 == 16);                                                  \
-    while (w >= 16) {                                                       \
-      aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                               dst_stride, h, filter);      \
-      src += 16;                                                            \
-      dst += 16;                                                            \
-      w -= 16;                                                              \
-    }                                                                       \
-    while (w >= 8) {                                                        \
-      aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 8;                                                             \
-      dst += 8;                                                             \
-      w -= 8;                                                               \
-    }                                                                       \
-    while (w >= 4) {                                                        \
-      aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 4;                                                             \
-      dst += 4;                                                             \
-      w -= 4;                                                               \
-    }                                                                       \
-    if (w) {                                                                \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,  \
-                               x_step_q4, filter_y, y_step_q4, w, h);       \
-    }                                                                       \
-  }
-
-#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt)                           \
-  void aom_convolve8_##type##opt(                                           \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);  \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                  \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                  \
-    assert(w <= MAX_SB_SIZE);                                               \
-    assert(h <= MAX_SB_SIZE);                                               \
-    assert(x_step_q4 == 16);                                                \
-    assert(y_step_q4 == 16);                                                \
-    aom_convolve8_##htype##horiz_##opt(                                     \
-        src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x,    \
-        x_step_q4, filter_y, y_step_q4, w, h + 7);                          \
-    aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                     dst, dst_stride, filter_x, x_step_q4,  \
-                                     filter_y, y_step_q4, w, h);            \
-  }
-#endif
-
-#if CONFIG_HIGHBITDEPTH
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
@@ -248,41 +153,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
     }                                                                      \
   }
 
-#define HIGH_FUN_CONV_2D(avg, opt)                                            \
-  void aom_highbd_convolve8_##avg##opt(                                       \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
-    assert(w <= MAX_SB_SIZE);                                                 \
-    assert(h <= MAX_SB_SIZE);                                                 \
-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \
-      if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 ||  \
-          filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) {  \
-        DECLARE_ALIGNED(16, uint16_t,                                         \
-                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);             \
-        aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,    \
-                                         CONVERT_TO_BYTEPTR(fdata2),          \
-                                         MAX_SB_SIZE, filter_x, x_step_q4,    \
-                                         filter_y, y_step_q4, w, h + 7, bd);  \
-        aom_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst,   \
-            dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);  \
-      } else {                                                                \
-        DECLARE_ALIGNED(16, uint16_t,                                         \
-                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]);             \
-        aom_highbd_convolve8_horiz_##opt(                                     \
-            src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE,         \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd);          \
-        aom_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride,         \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);              \
-      }                                                                       \
-    } else {                                                                  \
-      aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \
-                                    filter_x, x_step_q4, filter_y, y_step_q4, \
-                                    w, h, bd);                                \
-    }                                                                         \
-  }
-#endif  // CONFIG_HIGHBITDEPTH
-
 #endif  // AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 000000000..7790baf2e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+// filters for 16
+DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+static INLINE void prepare_coeffs_lowbd(
+    const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // right shift all filter co-efficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter co-efficients are even, this change will not affect the
+  // end result
+  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                            _mm_set1_epi16(0xffff)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m256i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+
+  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+                                     const __m256i *const coeffs) {
+  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+                                       _mm256_add_epi16(res_23, res_67));
+
+  return res;
+}
+
+static INLINE __m256i convolve(const __m256i *const s,
+                               const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+                                       _mm256_add_epi32(res_2, res_3));
+
+  return res;
+}
+
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+                                       const __m256i *const coeffs,
+                                       const __m256i *const filt) {
+  __m256i s[4];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+  s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+  return convolve_lowbd(s, coeffs);
+}
+
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+                                         const __m256i *const res,
+                                         const int do_average) {
+  __m256i d;
+  if (do_average) {
+    d = _mm256_load_si256((__m256i *)dst);
+    d = _mm256_add_epi32(d, *res);
+    d = _mm256_srai_epi32(d, 1);
+  } else {
+    d = *res;
+  }
+  _mm256_store_si256((__m256i *)dst, d);
+}
+
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+                               const __m256i *const res_unsigned,
+                               const __m256i *const wt,
+                               const int use_jnt_comp_avg) {
+  __m256i res;
+  if (use_jnt_comp_avg) {
+    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
+    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
+
+    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+    res = _mm256_packs_epi32(res_lo, res_hi);
+  } else {
+    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+    res = _mm256_srai_epi16(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+                                        const __m256i *const offset_const,
+                                        const __m256i *const round_const,
+                                        const int round_shift) {
+  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
+  const __m256i res_round = _mm256_srai_epi16(
+      _mm256_add_epi16(res_signed, *round_const), round_shift);
+  return res_round;
+}
+
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+                                      const __m256i *const res_unsigned,
+                                      const __m256i *const wt0,
+                                      const __m256i *const wt1,
+                                      const int use_jnt_comp_avg) {
+  __m256i res;
+  if (use_jnt_comp_avg) {
+    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
+    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
+    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
+    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
+  } else {
+    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+    res = _mm256_srai_epi32(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m256i highbd_convolve_rounding(
+    const __m256i *const res_unsigned, const __m256i *const offset_const,
+    const __m256i *const round_const, const int round_shift) {
+  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
+  const __m256i res_round = _mm256_srai_epi32(
+      _mm256_add_epi32(res_signed, *round_const), round_shift);
+
+  return res_round;
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 000000000..e80c5872f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+//  This header file should be put below any x86 intrinsics head file
+
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+                             const int do_average) {
+  __m128i d;
+  if (do_average) {
+    d = _mm_load_si128((__m128i *)dst);
+    d = _mm_add_epi32(d, *res);
+    d = _mm_srai_epi32(d, 1);
+  } else {
+    d = *res;
+  }
+  _mm_store_si128((__m128i *)dst, d);
+}
+
+#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 000000000..846fe7bb4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+// Note:
+//  This header file should be put below any x86 intrinsics head file
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
+  const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
+  const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
+  const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+  const __m128i res =
+      _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
+
+  return res;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  __m128i ss[4];
+  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+  return convolve(ss, coeffs);
+}
+
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+                               const __m128i *const res_unsigned,
+                               const __m128i *const wt,
+                               const int use_jnt_comp_avg) {
+  __m128i res;
+  if (use_jnt_comp_avg) {
+    const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+    const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+    const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
+    const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
+
+    const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+    const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+    res = _mm_packs_epi32(res_lo, res_hi);
+  } else {
+    const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
+    res = _mm_srai_epi16(wt_res, 1);
+  }
+  return res;
+}
+
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+                                        const __m128i *const offset_const,
+                                        const __m128i *const round_const,
+                                        const int round_shift) {
+  const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
+  const __m128i res_round =
+      _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
+  return res_round;
+}
+
+static INLINE __m128i highbd_convolve_rounding_sse2(
+    const __m128i *const res_unsigned, const __m128i *const offset_const,
+    const __m128i *const round_const, const int round_shift) {
+  const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
+  const __m128i res_round =
+      _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
+
+  return res_round;
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 000000000..d48c25667
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_
+
+// Note:
+//  This header file should be put below any x86 intrinsics head file
+
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+                                  const __m128i *const res,
+                                  const __m128i *const wt0,
+                                  const __m128i *const wt1,
+                                  const int do_average) {
+  __m128i d;
+  if (do_average) {
+    d = _mm_load_si128((__m128i *)dst);
+    d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
+    d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
+  } else {
+    d = *res;
+  }
+  _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+                                             const __m128i *const res_unsigned,
+                                             const __m128i *const wt0,
+                                             const __m128i *const wt1,
+                                             const int use_jnt_comp_avg) {
+  __m128i res;
+  if (use_jnt_comp_avg) {
+    const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
+    const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
+
+    const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
+    res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
+  } else {
+    const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
+    res = _mm_srai_epi32(wt_res, 1);
+  }
+  return res;
+}
+
+#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 000000000..54da02253
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+                                          int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+          _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+          _mm256_mul_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+           _mm256_mul_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+            _mm256_mul_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+            _mm256_mul_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+                  aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+                  aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+                  aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 000000000..12bdc3e18
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+                                const int ldb) {
+  __m128 row1 = _mm_load_ps(&A[0 * lda]);
+  __m128 row2 = _mm_load_ps(&A[1 * lda]);
+  __m128 row3 = _mm_load_ps(&A[2 * lda]);
+  __m128 row4 = _mm_load_ps(&A[3 * lda]);
+  _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+  _mm_store_ps(&B[0 * ldb], row1);
+  _mm_store_ps(&B[1 * ldb], row2);
+  _mm_store_ps(&B[2 * ldb], row3);
+  _mm_store_ps(&B[3 * ldb], row4);
+}
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+  for (int y = 0; y < n; y += 4) {
+    for (int x = 0; x < n; x += 4) {
+      transpose4x4(A + y * n + x, B + x * n + y, n, n);
+    }
+  }
+}
+
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+  const int n2 = n / 2;
+  output[0] = packed[0];
+  output[1] = 0;
+  output[2 * (n2 * n)] = packed[n2 * n];
+  output[2 * (n2 * n) + 1] = 0;
+
+  output[2 * n2] = packed[n2];
+  output[2 * n2 + 1] = 0;
+  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+  output[2 * (n2 * n + n2) + 1] = 0;
+
+  for (int c = 1; c < n2; ++c) {
+    output[2 * (0 * n + c)] = packed[c];
+    output[2 * (0 * n + c) + 1] = packed[c + n2];
+    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+  }
+  for (int r = 1; r < n2; ++r) {
+    output[2 * (r * n + 0)] = packed[r * n];
+    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+    for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+      output[2 * (r * n + c)] =
+          packed[r * n + c] - packed[(r + n2) * n + c + n2];
+      output[2 * (r * n + c) + 1] =
+          packed[(r + n2) * n + c] + packed[r * n + c + n2];
+    }
+
+    for (int c = 4; c < n2; c += 4) {
+      __m128 real1 = _mm_load_ps(packed + r * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+      real1 = _mm_sub_ps(real1, real2);
+      imag1 = _mm_add_ps(imag1, imag2);
+      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+    }
+
+    int r2 = r + n2;
+    int r3 = n - r2;
+    output[2 * (r2 * n + 0)] = packed[r3 * n];
+    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+    for (int c = 1; c < AOMMIN(4, n2); ++c) {
+      output[2 * (r2 * n + c)] =
+          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+      output[2 * (r2 * n + c) + 1] =
+          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+    }
+    for (int c = 4; c < n2; c += 4) {
+      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+      real1 = _mm_add_ps(real1, real2);
+      imag1 = _mm_sub_ps(imag2, imag1);
+      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+      _mm_store_ps(output + 2 * (r2 * n + c + 2),
+                   _mm_unpackhi_ps(real1, imag1));
+    }
+  }
+}
+
+// Generate definitions for 1d transforms using float and __mm128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and mm128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+                  aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+                  aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
deleted file mode 100644
index b8ec08de7..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c
+++ /dev/null
@@ -1,862 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// Apply a 32-element IDCT to 8 columns. This does not do any transposition
-// of its output - the caller is expected to do that.
-// The input buffers are the top and bottom halves of an 8x32 block.
-void fdct32_8col(__m128i *in0, __m128i *in1) {
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i step1[32];
-  __m128i step2[32];
-  __m128i step3[32];
-  __m128i out[32];
-  // Stage 1
-  {
-    const __m128i *ina = in0;
-    const __m128i *inb = in1 + 15;
-    __m128i *step1a = &step1[0];
-    __m128i *step1b = &step1[31];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 4;
-    const __m128i *inb = in1 + 11;
-    __m128i *step1a = &step1[4];
-    __m128i *step1b = &step1[27];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 8;
-    const __m128i *inb = in1 + 7;
-    __m128i *step1a = &step1[8];
-    __m128i *step1b = &step1[23];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  {
-    const __m128i *ina = in0 + 12;
-    const __m128i *inb = in1 + 3;
-    __m128i *step1a = &step1[12];
-    __m128i *step1b = &step1[19];
-    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
-    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
-    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
-    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
-    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
-    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
-    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-    step1a[0] = _mm_add_epi16(ina0, inb0);
-    step1a[1] = _mm_add_epi16(ina1, inb1);
-    step1a[2] = _mm_add_epi16(ina2, inb2);
-    step1a[3] = _mm_add_epi16(ina3, inb3);
-    step1b[-3] = _mm_sub_epi16(ina3, inb3);
-    step1b[-2] = _mm_sub_epi16(ina2, inb2);
-    step1b[-1] = _mm_sub_epi16(ina1, inb1);
-    step1b[-0] = _mm_sub_epi16(ina0, inb0);
-  }
-  // Stage 2
-  {
-    step2[0] = _mm_add_epi16(step1[0], step1[15]);
-    step2[1] = _mm_add_epi16(step1[1], step1[14]);
-    step2[2] = _mm_add_epi16(step1[2], step1[13]);
-    step2[3] = _mm_add_epi16(step1[3], step1[12]);
-    step2[4] = _mm_add_epi16(step1[4], step1[11]);
-    step2[5] = _mm_add_epi16(step1[5], step1[10]);
-    step2[6] = _mm_add_epi16(step1[6], step1[9]);
-    step2[7] = _mm_add_epi16(step1[7], step1[8]);
-    step2[8] = _mm_sub_epi16(step1[7], step1[8]);
-    step2[9] = _mm_sub_epi16(step1[6], step1[9]);
-    step2[10] = _mm_sub_epi16(step1[5], step1[10]);
-    step2[11] = _mm_sub_epi16(step1[4], step1[11]);
-    step2[12] = _mm_sub_epi16(step1[3], step1[12]);
-    step2[13] = _mm_sub_epi16(step1[2], step1[13]);
-    step2[14] = _mm_sub_epi16(step1[1], step1[14]);
-    step2[15] = _mm_sub_epi16(step1[0], step1[15]);
-  }
-  {
-    const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-    const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-    const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-    const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-    const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-    const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-    const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-    const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-    const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-    const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-    const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-    const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-    const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-    const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-    const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-    const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-    const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-    const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-    const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-    const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-    const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-    const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-    const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-    const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-    const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-    const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-    const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-    const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-    const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-    const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-    const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-    const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-    const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-    const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-    const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-    const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-    const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-    const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-    const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-    // Combine
-    step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-    step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-    step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-    step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-    step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-    step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-    step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-    step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-  }
-  // Stage 3
-  {
-    step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
-    step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
-    step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
-    step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
-    step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
-    step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
-    step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
-    step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
-  }
-  {
-    const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-    const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-    const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-    const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-    const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-    const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-    const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-    const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-    const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-    const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-    const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-    const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-    const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-    const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-    const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-    const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-    const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-    const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-    const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-    // Combine
-    step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-    step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-    step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-    step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-  }
-  {
-    step3[16] = _mm_add_epi16(step2[23], step1[16]);
-    step3[17] = _mm_add_epi16(step2[22], step1[17]);
-    step3[18] = _mm_add_epi16(step2[21], step1[18]);
-    step3[19] = _mm_add_epi16(step2[20], step1[19]);
-    step3[20] = _mm_sub_epi16(step1[19], step2[20]);
-    step3[21] = _mm_sub_epi16(step1[18], step2[21]);
-    step3[22] = _mm_sub_epi16(step1[17], step2[22]);
-    step3[23] = _mm_sub_epi16(step1[16], step2[23]);
-    step3[24] = _mm_sub_epi16(step1[31], step2[24]);
-    step3[25] = _mm_sub_epi16(step1[30], step2[25]);
-    step3[26] = _mm_sub_epi16(step1[29], step2[26]);
-    step3[27] = _mm_sub_epi16(step1[28], step2[27]);
-    step3[28] = _mm_add_epi16(step2[27], step1[28]);
-    step3[29] = _mm_add_epi16(step2[26], step1[29]);
-    step3[30] = _mm_add_epi16(step2[25], step1[30]);
-    step3[31] = _mm_add_epi16(step2[24], step1[31]);
-  }
-
-  // Stage 4
-  {
-    step1[0] = _mm_add_epi16(step3[3], step3[0]);
-    step1[1] = _mm_add_epi16(step3[2], step3[1]);
-    step1[2] = _mm_sub_epi16(step3[1], step3[2]);
-    step1[3] = _mm_sub_epi16(step3[0], step3[3]);
-    step1[8] = _mm_add_epi16(step3[11], step2[8]);
-    step1[9] = _mm_add_epi16(step3[10], step2[9]);
-    step1[10] = _mm_sub_epi16(step2[9], step3[10]);
-    step1[11] = _mm_sub_epi16(step2[8], step3[11]);
-    step1[12] = _mm_sub_epi16(step2[15], step3[12]);
-    step1[13] = _mm_sub_epi16(step2[14], step3[13]);
-    step1[14] = _mm_add_epi16(step3[13], step2[14]);
-    step1[15] = _mm_add_epi16(step3[12], step2[15]);
-  }
-  {
-    const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-    const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-    const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-    const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-    const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-    const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-    const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-    const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-    const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-    // Combine
-    step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-    step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-  }
-  {
-    const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-    const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-    const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-    const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-    const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-    const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-    const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-    const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-    const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-    const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-    const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-    const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-    const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-    const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-    const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-    const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-    const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-    const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-    const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-    const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-    const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-    const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-    const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-    const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-    // dct_const_round_shift
-    const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-    const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-    const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-    const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-    const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-    const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-    const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-    const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-    const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-    const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-    const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-    const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-    const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-    const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-    const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-    const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-    // Combine
-    step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-    step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-    step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-    step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-    step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-    step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-    step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-    step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-  }
-  // Stage 5
-  {
-    step2[4] = _mm_add_epi16(step1[5], step3[4]);
-    step2[5] = _mm_sub_epi16(step3[4], step1[5]);
-    step2[6] = _mm_sub_epi16(step3[7], step1[6]);
-    step2[7] = _mm_add_epi16(step1[6], step3[7]);
-  }
-  {
-    const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-    const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-    const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-    const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-    const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-    const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-    const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-    const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-    const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-    const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-    const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-    const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-    // dct_const_round_shift
-    const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-    const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-    const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-    const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-    const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-    const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-    const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-    const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-    // Combine
-    out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-    out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-    out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-    out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-  }
-  {
-    const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-    const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-    const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-    const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-    const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-    const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-    const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-    const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-    const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-    const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-    const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-    const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-    // dct_const_round_shift
-    const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-    const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-    const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-    const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-    const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-    const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-    const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-    const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-    const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-    const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-    // Combine
-    step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-    step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-    step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-    step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-  }
-  {
-    step2[16] = _mm_add_epi16(step1[19], step3[16]);
-    step2[17] = _mm_add_epi16(step1[18], step3[17]);
-    step2[18] = _mm_sub_epi16(step3[17], step1[18]);
-    step2[19] = _mm_sub_epi16(step3[16], step1[19]);
-    step2[20] = _mm_sub_epi16(step3[23], step1[20]);
-    step2[21] = _mm_sub_epi16(step3[22], step1[21]);
-    step2[22] = _mm_add_epi16(step1[21], step3[22]);
-    step2[23] = _mm_add_epi16(step1[20], step3[23]);
-    step2[24] = _mm_add_epi16(step1[27], step3[24]);
-    step2[25] = _mm_add_epi16(step1[26], step3[25]);
-    step2[26] = _mm_sub_epi16(step3[25], step1[26]);
-    step2[27] = _mm_sub_epi16(step3[24], step1[27]);
-    step2[28] = _mm_sub_epi16(step3[31], step1[28]);
-    step2[29] = _mm_sub_epi16(step3[30], step1[29]);
-    step2[30] = _mm_add_epi16(step1[29], step3[30]);
-    step2[31] = _mm_add_epi16(step1[28], step3[31]);
-  }
-  // Stage 6
-  {
-    const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-    const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-    const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-    const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-    const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-    const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-    const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-    const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-    const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-    const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-    const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-    const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-    const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-    const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-    const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-    const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-    // dct_const_round_shift
-    const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-    const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-    const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-    const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-    const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-    const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-    const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-    const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-    // Combine
-    out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-    out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-    out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-    out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-  }
-  {
-    step3[8] = _mm_add_epi16(step2[9], step1[8]);
-    step3[9] = _mm_sub_epi16(step1[8], step2[9]);
-    step3[10] = _mm_sub_epi16(step1[11], step2[10]);
-    step3[11] = _mm_add_epi16(step2[10], step1[11]);
-    step3[12] = _mm_add_epi16(step2[13], step1[12]);
-    step3[13] = _mm_sub_epi16(step1[12], step2[13]);
-    step3[14] = _mm_sub_epi16(step1[15], step2[14]);
-    step3[15] = _mm_add_epi16(step2[14], step1[15]);
-  }
-  {
-    const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-    const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-    const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-    const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-    const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-    const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-    const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-    const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-    const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-    const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-    const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-    const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-    const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-    const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-    const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-    const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-    const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-    const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-    const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-    const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-    const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-    const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-    const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-    const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-    // dct_const_round_shift
-    const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-    const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-    const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-    const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-    const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-    const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-    const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-    const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-    const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-    const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-    const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-    const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-    const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-    const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-    const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-    const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-    const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-    const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-    // Combine
-    step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-    step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-    step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-    step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-    // Combine
-    step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-    step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-    step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-    step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-  }
-  // Stage 7
-  {
-    const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-    const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-    const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-    const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-    const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-    const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-    const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-    const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-    const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-    const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-    const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-    const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-    const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-    const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-    const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-    const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-    const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-    const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-    const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-    const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-    const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-    const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-    const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-    const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-    // dct_const_round_shift
-    const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-    const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-    const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-    const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-    const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-    const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-    const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-    const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-    const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-    const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-    const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-    const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-    const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-    const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-    const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-    const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-    // Combine
-    out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-    out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-    out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-    out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-    out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-    out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-    out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-    out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-  }
-  {
-    step1[16] = _mm_add_epi16(step3[17], step2[16]);
-    step1[17] = _mm_sub_epi16(step2[16], step3[17]);
-    step1[18] = _mm_sub_epi16(step2[19], step3[18]);
-    step1[19] = _mm_add_epi16(step3[18], step2[19]);
-    step1[20] = _mm_add_epi16(step3[21], step2[20]);
-    step1[21] = _mm_sub_epi16(step2[20], step3[21]);
-    step1[22] = _mm_sub_epi16(step2[23], step3[22]);
-    step1[23] = _mm_add_epi16(step3[22], step2[23]);
-    step1[24] = _mm_add_epi16(step3[25], step2[24]);
-    step1[25] = _mm_sub_epi16(step2[24], step3[25]);
-    step1[26] = _mm_sub_epi16(step2[27], step3[26]);
-    step1[27] = _mm_add_epi16(step3[26], step2[27]);
-    step1[28] = _mm_add_epi16(step3[29], step2[28]);
-    step1[29] = _mm_sub_epi16(step2[28], step3[29]);
-    step1[30] = _mm_sub_epi16(step2[31], step3[30]);
-    step1[31] = _mm_add_epi16(step3[30], step2[31]);
-  }
-  // Final stage --- outputs indices are bit-reversed.
-  {
-    const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-    const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-    const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-    const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-    const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-    const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-    const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-    const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-    const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-    const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-    const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-    const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-    const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-    const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-    const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-    const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-    const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-    const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-    const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-    const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-    const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-    const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-    const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-    const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-    // dct_const_round_shift
-    const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-    const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-    const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-    const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-    const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-    const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-    const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-    const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-    const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-    const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-    const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-    const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-    const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-    const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-    const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-    const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-    // Combine
-    out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-    out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-    out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-    out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-    out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-    out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-    out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-    out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-  }
-  {
-    const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-    const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-    const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-    const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-    const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-    const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-    const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-    const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-    const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-    const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-    const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-    const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-    const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-    const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-    const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-    const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-    const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-    const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-    const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-    const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-    const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-    const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-    const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-    const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-    // dct_const_round_shift
-    const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-    const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-    const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-    const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-    const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-    const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-    const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-    const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-    const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-    const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-    const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-    const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-    const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-    const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-    const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-    const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-    const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-    const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-    // Combine
-    out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-    out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-    out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-    out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-    out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-    out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-    out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-    out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-  }
-
-  // Output results
-  {
-    int j;
-    for (j = 0; j < 16; ++j) {
-      _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
-      _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
-    }
-  }
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
deleted file mode 100644
index 216739581..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ /dev/null
@@ -1,3022 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_intrin.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-#if FDCT32x32_HIGH_PRECISION
-static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
-  __m256i buf0, buf1;
-  buf0 = _mm256_mul_epu32(a, b);
-  a = _mm256_srli_epi64(a, 32);
-  b = _mm256_srli_epi64(b, 32);
-  buf1 = _mm256_mul_epu32(a, b);
-  return _mm256_add_epi64(buf0, buf1);
-}
-
-static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
-  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm256_unpacklo_epi64(buf0, buf1);
-}
-#endif
-
-#ifndef STORE_COEFF_FUNC
-#define STORE_COEFF_FUNC
-static void store_coeff(const __m256i *coeff, tran_low_t *curr,
-                        tran_low_t *next) {
-  __m128i u = _mm256_castsi256_si128(*coeff);
-  storeu_output(&u, curr);
-  u = _mm256_extractf128_si256(*coeff, 1);
-  storeu_output(&u, next);
-}
-#endif
-
-void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
-                       int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i k__cospi_p16_m16 =
-      pair256_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i k__cospi_m12_m20 =
-      pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  const __m256i kZero = _mm256_set1_epi16(0);
-  const __m256i kOne = _mm256_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-  for (pass = 0; pass < 2; ++pass) {
-    // We process sixteen columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 16) {
-      __m256i step1[32];
-      __m256i step2[32];
-      __m256i step3[32];
-      __m256i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic make the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m256i *step1a = &step1[0];
-          __m256i *step1b = &step1[31];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m256i *step1a = &step1[4];
-          __m256i *step1b = &step1[27];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m256i *step1a = &step1[8];
-          __m256i *step1b = &step1[23];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m256i *step1a = &step1[12];
-          __m256i *step1b = &step1[19];
-          const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
-          const __m256i ina1 =
-              _mm256_loadu_si256((const __m256i *)(ina + str1));
-          const __m256i ina2 =
-              _mm256_loadu_si256((const __m256i *)(ina + str2));
-          const __m256i ina3 =
-              _mm256_loadu_si256((const __m256i *)(ina + str3));
-          const __m256i inb3 =
-              _mm256_loadu_si256((const __m256i *)(inb - str3));
-          const __m256i inb2 =
-              _mm256_loadu_si256((const __m256i *)(inb - str2));
-          const __m256i inb1 =
-              _mm256_loadu_si256((const __m256i *)(inb - str1));
-          const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
-          step1a[0] = _mm256_add_epi16(ina0, inb0);
-          step1a[1] = _mm256_add_epi16(ina1, inb1);
-          step1a[2] = _mm256_add_epi16(ina2, inb2);
-          step1a[3] = _mm256_add_epi16(ina3, inb3);
-          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
-          step1a[0] = _mm256_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm256_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm256_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm256_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
-          __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
-          __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
-          __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
-          __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
-          __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
-          __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
-          __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
-          step1[0] = _mm256_add_epi16(in00, in31);
-          step1[1] = _mm256_add_epi16(in01, in30);
-          step1[2] = _mm256_add_epi16(in02, in29);
-          step1[3] = _mm256_add_epi16(in03, in28);
-          step1[28] = _mm256_sub_epi16(in03, in28);
-          step1[29] = _mm256_sub_epi16(in02, in29);
-          step1[30] = _mm256_sub_epi16(in01, in30);
-          step1[31] = _mm256_sub_epi16(in00, in31);
-        }
-        {
-          __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
-          __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
-          __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
-          __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
-          __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
-          __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
-          __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
-          __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
-          step1[4] = _mm256_add_epi16(in04, in27);
-          step1[5] = _mm256_add_epi16(in05, in26);
-          step1[6] = _mm256_add_epi16(in06, in25);
-          step1[7] = _mm256_add_epi16(in07, in24);
-          step1[24] = _mm256_sub_epi16(in07, in24);
-          step1[25] = _mm256_sub_epi16(in06, in25);
-          step1[26] = _mm256_sub_epi16(in05, in26);
-          step1[27] = _mm256_sub_epi16(in04, in27);
-        }
-        {
-          __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
-          __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
-          __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
-          __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
-          __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
-          __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
-          __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
-          __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
-          step1[8] = _mm256_add_epi16(in08, in23);
-          step1[9] = _mm256_add_epi16(in09, in22);
-          step1[10] = _mm256_add_epi16(in10, in21);
-          step1[11] = _mm256_add_epi16(in11, in20);
-          step1[20] = _mm256_sub_epi16(in11, in20);
-          step1[21] = _mm256_sub_epi16(in10, in21);
-          step1[22] = _mm256_sub_epi16(in09, in22);
-          step1[23] = _mm256_sub_epi16(in08, in23);
-        }
-        {
-          __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
-          __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
-          __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
-          __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
-          __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
-          __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
-          __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
-          __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
-          step1[12] = _mm256_add_epi16(in12, in19);
-          step1[13] = _mm256_add_epi16(in13, in18);
-          step1[14] = _mm256_add_epi16(in14, in17);
-          step1[15] = _mm256_add_epi16(in15, in16);
-          step1[16] = _mm256_sub_epi16(in15, in16);
-          step1[17] = _mm256_sub_epi16(in14, in17);
-          step1[18] = _mm256_sub_epi16(in13, in18);
-          step1[19] = _mm256_sub_epi16(in12, in19);
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = _mm256_add_epi16(step1[0], step1[15]);
-        step2[1] = _mm256_add_epi16(step1[1], step1[14]);
-        step2[2] = _mm256_add_epi16(step1[2], step1[13]);
-        step2[3] = _mm256_add_epi16(step1[3], step1[12]);
-        step2[4] = _mm256_add_epi16(step1[4], step1[11]);
-        step2[5] = _mm256_add_epi16(step1[5], step1[10]);
-        step2[6] = _mm256_add_epi16(step1[6], step1[9]);
-        step2[7] = _mm256_add_epi16(step1[7], step1[8]);
-        step2[8] = _mm256_sub_epi16(step1[7], step1[8]);
-        step2[9] = _mm256_sub_epi16(step1[6], step1[9]);
-        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
-        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
-        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
-        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
-        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
-        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
-      }
-      {
-        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
-        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
-        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
-        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
-        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
-        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
-        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
-        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
-        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
-        const __m256i s2_20_4 =
-            _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_20_5 =
-            _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_21_4 =
-            _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_21_5 =
-            _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_22_4 =
-            _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_22_5 =
-            _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_23_4 =
-            _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_23_5 =
-            _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_24_4 =
-            _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_24_5 =
-            _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_25_4 =
-            _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_25_5 =
-            _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_26_4 =
-            _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_26_5 =
-            _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_27_4 =
-            _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m256i s2_27_5 =
-            _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]);
-        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]);
-        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]);
-        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]);
-        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]);
-        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]);
-        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]);
-        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]);
-        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]);
-        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]);
-        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]);
-        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]);
-        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]);
-        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]);
-        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]);
-        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]);
-        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]);
-        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]);
-        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]);
-        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]);
-        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]);
-        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]);
-        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]);
-        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]);
-        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]);
-        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]);
-        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]);
-        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]);
-        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]);
-        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]);
-        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]);
-        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]);
-
-        step2[0] = _mm256_sub_epi16(step2[0], s3_00_0);
-        step2[1] = _mm256_sub_epi16(step2[1], s3_01_0);
-        step2[2] = _mm256_sub_epi16(step2[2], s3_02_0);
-        step2[3] = _mm256_sub_epi16(step2[3], s3_03_0);
-        step2[4] = _mm256_sub_epi16(step2[4], s3_04_0);
-        step2[5] = _mm256_sub_epi16(step2[5], s3_05_0);
-        step2[6] = _mm256_sub_epi16(step2[6], s3_06_0);
-        step2[7] = _mm256_sub_epi16(step2[7], s3_07_0);
-        step2[8] = _mm256_sub_epi16(step2[8], s2_08_0);
-        step2[9] = _mm256_sub_epi16(step2[9], s2_09_0);
-        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
-        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
-        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
-        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
-        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
-        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
-        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
-        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
-        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
-        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
-        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
-        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
-        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
-        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
-        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
-        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
-        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
-        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
-        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
-        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
-        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
-        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
-
-        step2[0] = _mm256_add_epi16(step2[0], kOne);
-        step2[1] = _mm256_add_epi16(step2[1], kOne);
-        step2[2] = _mm256_add_epi16(step2[2], kOne);
-        step2[3] = _mm256_add_epi16(step2[3], kOne);
-        step2[4] = _mm256_add_epi16(step2[4], kOne);
-        step2[5] = _mm256_add_epi16(step2[5], kOne);
-        step2[6] = _mm256_add_epi16(step2[6], kOne);
-        step2[7] = _mm256_add_epi16(step2[7], kOne);
-        step2[8] = _mm256_add_epi16(step2[8], kOne);
-        step2[9] = _mm256_add_epi16(step2[9], kOne);
-        step2[10] = _mm256_add_epi16(step2[10], kOne);
-        step2[11] = _mm256_add_epi16(step2[11], kOne);
-        step2[12] = _mm256_add_epi16(step2[12], kOne);
-        step2[13] = _mm256_add_epi16(step2[13], kOne);
-        step2[14] = _mm256_add_epi16(step2[14], kOne);
-        step2[15] = _mm256_add_epi16(step2[15], kOne);
-        step1[16] = _mm256_add_epi16(step1[16], kOne);
-        step1[17] = _mm256_add_epi16(step1[17], kOne);
-        step1[18] = _mm256_add_epi16(step1[18], kOne);
-        step1[19] = _mm256_add_epi16(step1[19], kOne);
-        step2[20] = _mm256_add_epi16(step2[20], kOne);
-        step2[21] = _mm256_add_epi16(step2[21], kOne);
-        step2[22] = _mm256_add_epi16(step2[22], kOne);
-        step2[23] = _mm256_add_epi16(step2[23], kOne);
-        step2[24] = _mm256_add_epi16(step2[24], kOne);
-        step2[25] = _mm256_add_epi16(step2[25], kOne);
-        step2[26] = _mm256_add_epi16(step2[26], kOne);
-        step2[27] = _mm256_add_epi16(step2[27], kOne);
-        step1[28] = _mm256_add_epi16(step1[28], kOne);
-        step1[29] = _mm256_add_epi16(step1[29], kOne);
-        step1[30] = _mm256_add_epi16(step1[30], kOne);
-        step1[31] = _mm256_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm256_srai_epi16(step2[0], 2);
-        step2[1] = _mm256_srai_epi16(step2[1], 2);
-        step2[2] = _mm256_srai_epi16(step2[2], 2);
-        step2[3] = _mm256_srai_epi16(step2[3], 2);
-        step2[4] = _mm256_srai_epi16(step2[4], 2);
-        step2[5] = _mm256_srai_epi16(step2[5], 2);
-        step2[6] = _mm256_srai_epi16(step2[6], 2);
-        step2[7] = _mm256_srai_epi16(step2[7], 2);
-        step2[8] = _mm256_srai_epi16(step2[8], 2);
-        step2[9] = _mm256_srai_epi16(step2[9], 2);
-        step2[10] = _mm256_srai_epi16(step2[10], 2);
-        step2[11] = _mm256_srai_epi16(step2[11], 2);
-        step2[12] = _mm256_srai_epi16(step2[12], 2);
-        step2[13] = _mm256_srai_epi16(step2[13], 2);
-        step2[14] = _mm256_srai_epi16(step2[14], 2);
-        step2[15] = _mm256_srai_epi16(step2[15], 2);
-        step1[16] = _mm256_srai_epi16(step1[16], 2);
-        step1[17] = _mm256_srai_epi16(step1[17], 2);
-        step1[18] = _mm256_srai_epi16(step1[18], 2);
-        step1[19] = _mm256_srai_epi16(step1[19], 2);
-        step2[20] = _mm256_srai_epi16(step2[20], 2);
-        step2[21] = _mm256_srai_epi16(step2[21], 2);
-        step2[22] = _mm256_srai_epi16(step2[22], 2);
-        step2[23] = _mm256_srai_epi16(step2[23], 2);
-        step2[24] = _mm256_srai_epi16(step2[24], 2);
-        step2[25] = _mm256_srai_epi16(step2[25], 2);
-        step2[26] = _mm256_srai_epi16(step2[26], 2);
-        step2[27] = _mm256_srai_epi16(step2[27], 2);
-        step1[28] = _mm256_srai_epi16(step1[28], 2);
-        step1[29] = _mm256_srai_epi16(step1[29], 2);
-        step1[30] = _mm256_srai_epi16(step1[30], 2);
-        step1[31] = _mm256_srai_epi16(step1[31], 2);
-      }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
-          step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
-          step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
-          step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
-          step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
-          step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
-          step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
-          step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
-          step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
-        }
-        {
-          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
-          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
-          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
-          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
-          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s3_10_4 =
-              _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_5 =
-              _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_4 =
-              _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_5 =
-              _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_4 =
-              _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_5 =
-              _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_4 =
-              _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_5 =
-              _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
-        }
-        {
-          step3[16] = _mm256_add_epi16(step2[23], step1[16]);
-          step3[17] = _mm256_add_epi16(step2[22], step1[17]);
-          step3[18] = _mm256_add_epi16(step2[21], step1[18]);
-          step3[19] = _mm256_add_epi16(step2[20], step1[19]);
-          step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
-          step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
-          step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
-          step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
-          step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
-          step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
-          step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
-          step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
-          step3[28] = _mm256_add_epi16(step2[27], step1[28]);
-          step3[29] = _mm256_add_epi16(step2[26], step1[29]);
-          step3[30] = _mm256_add_epi16(step2[25], step1[30]);
-          step3[31] = _mm256_add_epi16(step2[24], step1[31]);
-        }
-
-        // Stage 4
-        {
-          step1[0] = _mm256_add_epi16(step3[3], step3[0]);
-          step1[1] = _mm256_add_epi16(step3[2], step3[1]);
-          step1[2] = _mm256_sub_epi16(step3[1], step3[2]);
-          step1[3] = _mm256_sub_epi16(step3[0], step3[3]);
-          step1[8] = _mm256_add_epi16(step3[11], step2[8]);
-          step1[9] = _mm256_add_epi16(step3[10], step2[9]);
-          step1[10] = _mm256_sub_epi16(step2[9], step3[10]);
-          step1[11] = _mm256_sub_epi16(step2[8], step3[11]);
-          step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
-          step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
-          step1[14] = _mm256_add_epi16(step3[13], step2[14]);
-          step1[15] = _mm256_add_epi16(step3[12], step2[15]);
-        }
-        {
-          const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
-          const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
-          const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s1_05_4 =
-              _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_05_5 =
-              _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_06_4 =
-              _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_06_5 =
-              _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
-        }
-        {
-          const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
-          const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
-          const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
-          const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
-          const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
-          const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
-          const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
-          const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
-          const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m256i s1_18_4 =
-              _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_18_5 =
-              _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_19_4 =
-              _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_19_5 =
-              _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_20_4 =
-              _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_20_5 =
-              _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_21_4 =
-              _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_21_5 =
-              _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_26_4 =
-              _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_26_5 =
-              _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_27_4 =
-              _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_27_5 =
-              _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_28_4 =
-              _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_28_5 =
-              _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_29_4 =
-              _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i s1_29_5 =
-              _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
-        }
-        // Stage 5
-        {
-          step2[4] = _mm256_add_epi16(step1[5], step3[4]);
-          step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
-          step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
-          step2[7] = _mm256_add_epi16(step1[6], step3[7]);
-        }
-        {
-          const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
-          const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
-          const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
-          const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
-          const __m256i out_00_2 =
-              _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m256i out_00_3 =
-              _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m256i out_16_2 =
-              _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m256i out_16_3 =
-              _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m256i out_08_2 =
-              _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m256i out_08_3 =
-              _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m256i out_24_2 =
-              _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m256i out_24_3 =
-              _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m256i out_00_4 =
-              _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_00_5 =
-              _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_16_4 =
-              _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_16_5 =
-              _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_08_4 =
-              _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_08_5 =
-              _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_24_4 =
-              _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_24_5 =
-              _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm256_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm256_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
-        }
-        {
-          const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]);
-          const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]);
-          const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
-          const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
-          const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m256i s2_09_4 =
-              _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_09_5 =
-              _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_10_4 =
-              _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_10_5 =
-              _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_13_4 =
-              _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_13_5 =
-              _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_14_4 =
-              _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m256i s2_14_5 =
-              _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
-        }
-        {
-          step2[16] = _mm256_add_epi16(step1[19], step3[16]);
-          step2[17] = _mm256_add_epi16(step1[18], step3[17]);
-          step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
-          step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
-          step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
-          step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
-          step2[22] = _mm256_add_epi16(step1[21], step3[22]);
-          step2[23] = _mm256_add_epi16(step1[20], step3[23]);
-          step2[24] = _mm256_add_epi16(step1[27], step3[24]);
-          step2[25] = _mm256_add_epi16(step1[26], step3[25]);
-          step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
-          step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
-          step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
-          step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
-          step2[30] = _mm256_add_epi16(step1[29], step3[30]);
-          step2[31] = _mm256_add_epi16(step1[28], step3[31]);
-        }
-        // Stage 6
-        {
-          const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
-          const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
-          const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
-          const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
-          const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
-          const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
-          const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
-          const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
-          const __m256i out_04_2 =
-              _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m256i out_04_3 =
-              _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m256i out_20_2 =
-              _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m256i out_20_3 =
-              _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m256i out_12_2 =
-              _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m256i out_12_3 =
-              _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m256i out_28_2 =
-              _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m256i out_28_3 =
-              _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m256i out_04_4 =
-              _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_04_5 =
-              _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_20_4 =
-              _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_20_5 =
-              _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_12_4 =
-              _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_12_5 =
-              _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_28_4 =
-              _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_28_5 =
-              _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm256_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
-        }
-        {
-          step3[8] = _mm256_add_epi16(step2[9], step1[8]);
-          step3[9] = _mm256_sub_epi16(step1[8], step2[9]);
-          step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
-          step3[11] = _mm256_add_epi16(step2[10], step1[11]);
-          step3[12] = _mm256_add_epi16(step2[13], step1[12]);
-          step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
-          step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
-          step3[15] = _mm256_add_epi16(step2[14], step1[15]);
-        }
-        {
-          const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
-          const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
-          const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
-          const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
-          const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
-          const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
-          const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
-          const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
-          const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m256i s3_17_4 =
-              _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_17_5 =
-              _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_18_4 =
-              _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_18_5 =
-              _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_21_4 =
-              _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_21_5 =
-              _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_22_4 =
-              _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_22_5 =
-              _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m256i s3_25_4 =
-              _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_25_5 =
-              _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_26_4 =
-              _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_26_5 =
-              _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_29_4 =
-              _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_29_5 =
-              _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_30_4 =
-              _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_30_5 =
-              _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
-          // Combine
-          step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
-        }
-        // Stage 7
-        {
-          const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]);
-          const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]);
-          const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]);
-          const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]);
-          const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
-          const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
-          const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
-          const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
-          const __m256i out_02_2 =
-              _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m256i out_02_3 =
-              _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m256i out_18_2 =
-              _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m256i out_18_3 =
-              _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m256i out_10_2 =
-              _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m256i out_10_3 =
-              _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m256i out_26_2 =
-              _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m256i out_26_3 =
-              _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m256i out_06_2 =
-              _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m256i out_06_3 =
-              _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m256i out_22_2 =
-              _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m256i out_22_3 =
-              _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m256i out_14_2 =
-              _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m256i out_14_3 =
-              _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m256i out_30_2 =
-              _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m256i out_30_3 =
-              _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m256i out_02_4 =
-              _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_02_5 =
-              _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_18_4 =
-              _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_18_5 =
-              _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_10_4 =
-              _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_10_5 =
-              _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_26_4 =
-              _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_26_5 =
-              _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_06_4 =
-              _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_06_5 =
-              _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_22_4 =
-              _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_22_5 =
-              _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_14_4 =
-              _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_14_5 =
-              _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_30_4 =
-              _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_30_5 =
-              _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm256_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm256_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
-        }
-        {
-          step1[16] = _mm256_add_epi16(step3[17], step2[16]);
-          step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
-          step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
-          step1[19] = _mm256_add_epi16(step3[18], step2[19]);
-          step1[20] = _mm256_add_epi16(step3[21], step2[20]);
-          step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
-          step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
-          step1[23] = _mm256_add_epi16(step3[22], step2[23]);
-          step1[24] = _mm256_add_epi16(step3[25], step2[24]);
-          step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
-          step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
-          step1[27] = _mm256_add_epi16(step3[26], step2[27]);
-          step1[28] = _mm256_add_epi16(step3[29], step2[28]);
-          step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
-          step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
-          step1[31] = _mm256_add_epi16(step3[30], step2[31]);
-        }
-        // Final stage --- outputs indices are bit-reversed.
-        {
-          const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
-          const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
-          const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
-          const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
-          const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
-          const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
-          const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
-          const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
-          const __m256i out_01_2 =
-              _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m256i out_01_3 =
-              _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m256i out_17_2 =
-              _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m256i out_17_3 =
-              _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m256i out_09_2 =
-              _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m256i out_09_3 =
-              _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m256i out_25_2 =
-              _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m256i out_25_3 =
-              _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m256i out_07_2 =
-              _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m256i out_07_3 =
-              _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m256i out_23_2 =
-              _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m256i out_23_3 =
-              _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m256i out_15_2 =
-              _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m256i out_15_3 =
-              _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m256i out_31_2 =
-              _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m256i out_31_3 =
-              _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m256i out_01_4 =
-              _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_01_5 =
-              _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_17_4 =
-              _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_17_5 =
-              _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_09_4 =
-              _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_09_5 =
-              _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_25_4 =
-              _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_25_5 =
-              _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_07_4 =
-              _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_07_5 =
-              _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_23_4 =
-              _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_23_5 =
-              _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_15_4 =
-              _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_15_5 =
-              _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_31_4 =
-              _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_31_5 =
-              _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm256_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm256_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm256_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
-        }
-        {
-          const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
-          const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
-          const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
-          const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
-          const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
-          const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
-          const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
-          const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
-          const __m256i out_05_2 =
-              _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m256i out_05_3 =
-              _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m256i out_21_2 =
-              _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m256i out_21_3 =
-              _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m256i out_13_2 =
-              _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m256i out_13_3 =
-              _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m256i out_29_2 =
-              _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m256i out_29_3 =
-              _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m256i out_03_2 =
-              _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m256i out_03_3 =
-              _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m256i out_19_2 =
-              _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m256i out_19_3 =
-              _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m256i out_11_2 =
-              _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m256i out_11_3 =
-              _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m256i out_27_2 =
-              _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m256i out_27_3 =
-              _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m256i out_05_4 =
-              _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_05_5 =
-              _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_21_4 =
-              _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_21_5 =
-              _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_13_4 =
-              _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_13_5 =
-              _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_29_4 =
-              _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_29_5 =
-              _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_03_4 =
-              _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_03_5 =
-              _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_19_4 =
-              _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_19_5 =
-              _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_11_4 =
-              _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_11_5 =
-              _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_27_4 =
-              _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m256i out_27_5 =
-              _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm256_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm256_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m256i lstep1[64], lstep2[64], lstep3[64];
-        __m256i u[32], v[32], sign[16];
-        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
-        // start using 32-bit operations
-        // stage 3
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
-          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
-          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
-          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
-          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m256i s3_10_4 =
-              _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_10_5 =
-              _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_4 =
-              _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_11_5 =
-              _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_4 =
-              _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_12_5 =
-              _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_4 =
-              _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m256i s3_13_5 =
-              _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m256i k32_p16_p16 =
-              pair256_set_epi32(cospi_16_64, cospi_16_64);
-          const __m256i k32_p16_m16 =
-              pair256_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m256i k32_m24_m08 =
-              pair256_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m256i k32_p16_p16 =
-              pair256_set_epi32(cospi_16_64, cospi_16_64);
-          const __m256i k32_p16_m16 =
-              pair256_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-
-          u[0] = _mm256_sub_epi32(u[0], sign[0]);
-          u[1] = _mm256_sub_epi32(u[1], sign[1]);
-          u[2] = _mm256_sub_epi32(u[2], sign[2]);
-          u[3] = _mm256_sub_epi32(u[3], sign[3]);
-          u[4] = _mm256_sub_epi32(u[4], sign[4]);
-          u[5] = _mm256_sub_epi32(u[5], sign[5]);
-          u[6] = _mm256_sub_epi32(u[6], sign[6]);
-          u[7] = _mm256_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm256_add_epi32(u[0], K32One);
-          u[1] = _mm256_add_epi32(u[1], K32One);
-          u[2] = _mm256_add_epi32(u[2], K32One);
-          u[3] = _mm256_add_epi32(u[3], K32One);
-          u[4] = _mm256_add_epi32(u[4], K32One);
-          u[5] = _mm256_add_epi32(u[5], K32One);
-          u[6] = _mm256_add_epi32(u[6], K32One);
-          u[7] = _mm256_add_epi32(u[7], K32One);
-
-          u[0] = _mm256_srai_epi32(u[0], 2);
-          u[1] = _mm256_srai_epi32(u[1], 2);
-          u[2] = _mm256_srai_epi32(u[2], 2);
-          u[3] = _mm256_srai_epi32(u[3], 2);
-          u[4] = _mm256_srai_epi32(u[4], 2);
-          u[5] = _mm256_srai_epi32(u[5], 2);
-          u[6] = _mm256_srai_epi32(u[6], 2);
-          u[7] = _mm256_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm256_packs_epi32(u[0], u[1]);
-          out[16] = _mm256_packs_epi32(u[2], u[3]);
-          out[8] = _mm256_packs_epi32(u[4], u[5]);
-          out[24] = _mm256_packs_epi32(u[6], u[7]);
-        }
-        {
-          const __m256i k32_m08_p24 =
-              pair256_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m256i k32_m24_m08 =
-              pair256_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m256i k32_p24_p08 =
-              pair256_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m256i k32_p28_p04 =
-              pair256_set_epi32(cospi_28_64, cospi_4_64);
-          const __m256i k32_p12_p20 =
-              pair256_set_epi32(cospi_12_64, cospi_20_64);
-          const __m256i k32_m20_p12 =
-              pair256_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m256i k32_m04_p28 =
-              pair256_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-
-          u[0] = _mm256_sub_epi32(u[0], sign[0]);
-          u[1] = _mm256_sub_epi32(u[1], sign[1]);
-          u[2] = _mm256_sub_epi32(u[2], sign[2]);
-          u[3] = _mm256_sub_epi32(u[3], sign[3]);
-          u[4] = _mm256_sub_epi32(u[4], sign[4]);
-          u[5] = _mm256_sub_epi32(u[5], sign[5]);
-          u[6] = _mm256_sub_epi32(u[6], sign[6]);
-          u[7] = _mm256_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm256_add_epi32(u[0], K32One);
-          u[1] = _mm256_add_epi32(u[1], K32One);
-          u[2] = _mm256_add_epi32(u[2], K32One);
-          u[3] = _mm256_add_epi32(u[3], K32One);
-          u[4] = _mm256_add_epi32(u[4], K32One);
-          u[5] = _mm256_add_epi32(u[5], K32One);
-          u[6] = _mm256_add_epi32(u[6], K32One);
-          u[7] = _mm256_add_epi32(u[7], K32One);
-
-          u[0] = _mm256_srai_epi32(u[0], 2);
-          u[1] = _mm256_srai_epi32(u[1], 2);
-          u[2] = _mm256_srai_epi32(u[2], 2);
-          u[3] = _mm256_srai_epi32(u[3], 2);
-          u[4] = _mm256_srai_epi32(u[4], 2);
-          u[5] = _mm256_srai_epi32(u[5], 2);
-          u[6] = _mm256_srai_epi32(u[6], 2);
-          u[7] = _mm256_srai_epi32(u[7], 2);
-
-          out[4] = _mm256_packs_epi32(u[0], u[1]);
-          out[20] = _mm256_packs_epi32(u[2], u[3]);
-          out[12] = _mm256_packs_epi32(u[4], u[5]);
-          out[28] = _mm256_packs_epi32(u[6], u[7]);
-        }
-        {
-          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m256i k32_m04_p28 =
-              pair256_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m256i k32_m28_m04 =
-              pair256_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m256i k32_m20_p12 =
-              pair256_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m256i k32_m12_m20 =
-              pair256_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m256i k32_p12_p20 =
-              pair256_set_epi32(cospi_12_64, cospi_20_64);
-          const __m256i k32_p28_p04 =
-              pair256_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 7
-        {
-          const __m256i k32_p30_p02 =
-              pair256_set_epi32(cospi_30_64, cospi_2_64);
-          const __m256i k32_p14_p18 =
-              pair256_set_epi32(cospi_14_64, cospi_18_64);
-          const __m256i k32_p22_p10 =
-              pair256_set_epi32(cospi_22_64, cospi_10_64);
-          const __m256i k32_p06_p26 =
-              pair256_set_epi32(cospi_6_64, cospi_26_64);
-          const __m256i k32_m26_p06 =
-              pair256_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m256i k32_m10_p22 =
-              pair256_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m256i k32_m18_p14 =
-              pair256_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m256i k32_m02_p30 =
-              pair256_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[2] = _mm256_packs_epi32(u[0], u[1]);
-          out[18] = _mm256_packs_epi32(u[2], u[3]);
-          out[10] = _mm256_packs_epi32(u[4], u[5]);
-          out[26] = _mm256_packs_epi32(u[6], u[7]);
-          out[6] = _mm256_packs_epi32(u[8], u[9]);
-          out[22] = _mm256_packs_epi32(u[10], u[11]);
-          out[14] = _mm256_packs_epi32(u[12], u[13]);
-          out[30] = _mm256_packs_epi32(u[14], u[15]);
-        }
-        {
-          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m256i k32_p31_p01 =
-              pair256_set_epi32(cospi_31_64, cospi_1_64);
-          const __m256i k32_p15_p17 =
-              pair256_set_epi32(cospi_15_64, cospi_17_64);
-          const __m256i k32_p23_p09 =
-              pair256_set_epi32(cospi_23_64, cospi_9_64);
-          const __m256i k32_p07_p25 =
-              pair256_set_epi32(cospi_7_64, cospi_25_64);
-          const __m256i k32_m25_p07 =
-              pair256_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m256i k32_m09_p23 =
-              pair256_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m256i k32_m17_p15 =
-              pair256_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m256i k32_m01_p31 =
-              pair256_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[1] = _mm256_packs_epi32(u[0], u[1]);
-          out[17] = _mm256_packs_epi32(u[2], u[3]);
-          out[9] = _mm256_packs_epi32(u[4], u[5]);
-          out[25] = _mm256_packs_epi32(u[6], u[7]);
-          out[7] = _mm256_packs_epi32(u[8], u[9]);
-          out[23] = _mm256_packs_epi32(u[10], u[11]);
-          out[15] = _mm256_packs_epi32(u[12], u[13]);
-          out[31] = _mm256_packs_epi32(u[14], u[15]);
-        }
-        {
-          const __m256i k32_p27_p05 =
-              pair256_set_epi32(cospi_27_64, cospi_5_64);
-          const __m256i k32_p11_p21 =
-              pair256_set_epi32(cospi_11_64, cospi_21_64);
-          const __m256i k32_p19_p13 =
-              pair256_set_epi32(cospi_19_64, cospi_13_64);
-          const __m256i k32_p03_p29 =
-              pair256_set_epi32(cospi_3_64, cospi_29_64);
-          const __m256i k32_m29_p03 =
-              pair256_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m256i k32_m13_p19 =
-              pair256_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m256i k32_m21_p11 =
-              pair256_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m256i k32_m05_p27 =
-              pair256_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27);
-
-          u[0] = k_packs_epi64_avx2(v[0], v[1]);
-          u[1] = k_packs_epi64_avx2(v[2], v[3]);
-          u[2] = k_packs_epi64_avx2(v[4], v[5]);
-          u[3] = k_packs_epi64_avx2(v[6], v[7]);
-          u[4] = k_packs_epi64_avx2(v[8], v[9]);
-          u[5] = k_packs_epi64_avx2(v[10], v[11]);
-          u[6] = k_packs_epi64_avx2(v[12], v[13]);
-          u[7] = k_packs_epi64_avx2(v[14], v[15]);
-          u[8] = k_packs_epi64_avx2(v[16], v[17]);
-          u[9] = k_packs_epi64_avx2(v[18], v[19]);
-          u[10] = k_packs_epi64_avx2(v[20], v[21]);
-          u[11] = k_packs_epi64_avx2(v[22], v[23]);
-          u[12] = k_packs_epi64_avx2(v[24], v[25]);
-          u[13] = k_packs_epi64_avx2(v[26], v[27]);
-          u[14] = k_packs_epi64_avx2(v[28], v[29]);
-          u[15] = k_packs_epi64_avx2(v[30], v[31]);
-
-          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm256_cmpgt_epi32(kZero, u[0]);
-          v[1] = _mm256_cmpgt_epi32(kZero, u[1]);
-          v[2] = _mm256_cmpgt_epi32(kZero, u[2]);
-          v[3] = _mm256_cmpgt_epi32(kZero, u[3]);
-          v[4] = _mm256_cmpgt_epi32(kZero, u[4]);
-          v[5] = _mm256_cmpgt_epi32(kZero, u[5]);
-          v[6] = _mm256_cmpgt_epi32(kZero, u[6]);
-          v[7] = _mm256_cmpgt_epi32(kZero, u[7]);
-          v[8] = _mm256_cmpgt_epi32(kZero, u[8]);
-          v[9] = _mm256_cmpgt_epi32(kZero, u[9]);
-          v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
-          v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
-          v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
-          v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
-          v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
-          v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
-
-          u[0] = _mm256_sub_epi32(u[0], v[0]);
-          u[1] = _mm256_sub_epi32(u[1], v[1]);
-          u[2] = _mm256_sub_epi32(u[2], v[2]);
-          u[3] = _mm256_sub_epi32(u[3], v[3]);
-          u[4] = _mm256_sub_epi32(u[4], v[4]);
-          u[5] = _mm256_sub_epi32(u[5], v[5]);
-          u[6] = _mm256_sub_epi32(u[6], v[6]);
-          u[7] = _mm256_sub_epi32(u[7], v[7]);
-          u[8] = _mm256_sub_epi32(u[8], v[8]);
-          u[9] = _mm256_sub_epi32(u[9], v[9]);
-          u[10] = _mm256_sub_epi32(u[10], v[10]);
-          u[11] = _mm256_sub_epi32(u[11], v[11]);
-          u[12] = _mm256_sub_epi32(u[12], v[12]);
-          u[13] = _mm256_sub_epi32(u[13], v[13]);
-          u[14] = _mm256_sub_epi32(u[14], v[14]);
-          u[15] = _mm256_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm256_add_epi32(u[0], K32One);
-          v[1] = _mm256_add_epi32(u[1], K32One);
-          v[2] = _mm256_add_epi32(u[2], K32One);
-          v[3] = _mm256_add_epi32(u[3], K32One);
-          v[4] = _mm256_add_epi32(u[4], K32One);
-          v[5] = _mm256_add_epi32(u[5], K32One);
-          v[6] = _mm256_add_epi32(u[6], K32One);
-          v[7] = _mm256_add_epi32(u[7], K32One);
-          v[8] = _mm256_add_epi32(u[8], K32One);
-          v[9] = _mm256_add_epi32(u[9], K32One);
-          v[10] = _mm256_add_epi32(u[10], K32One);
-          v[11] = _mm256_add_epi32(u[11], K32One);
-          v[12] = _mm256_add_epi32(u[12], K32One);
-          v[13] = _mm256_add_epi32(u[13], K32One);
-          v[14] = _mm256_add_epi32(u[14], K32One);
-          v[15] = _mm256_add_epi32(u[15], K32One);
-
-          u[0] = _mm256_srai_epi32(v[0], 2);
-          u[1] = _mm256_srai_epi32(v[1], 2);
-          u[2] = _mm256_srai_epi32(v[2], 2);
-          u[3] = _mm256_srai_epi32(v[3], 2);
-          u[4] = _mm256_srai_epi32(v[4], 2);
-          u[5] = _mm256_srai_epi32(v[5], 2);
-          u[6] = _mm256_srai_epi32(v[6], 2);
-          u[7] = _mm256_srai_epi32(v[7], 2);
-          u[8] = _mm256_srai_epi32(v[8], 2);
-          u[9] = _mm256_srai_epi32(v[9], 2);
-          u[10] = _mm256_srai_epi32(v[10], 2);
-          u[11] = _mm256_srai_epi32(v[11], 2);
-          u[12] = _mm256_srai_epi32(v[12], 2);
-          u[13] = _mm256_srai_epi32(v[13], 2);
-          u[14] = _mm256_srai_epi32(v[14], 2);
-          u[15] = _mm256_srai_epi32(v[15], 2);
-
-          out[5] = _mm256_packs_epi32(u[0], u[1]);
-          out[21] = _mm256_packs_epi32(u[2], u[3]);
-          out[13] = _mm256_packs_epi32(u[4], u[5]);
-          out[29] = _mm256_packs_epi32(u[6], u[7]);
-          out[3] = _mm256_packs_epi32(u[8], u[9]);
-          out[19] = _mm256_packs_epi32(u[10], u[11]);
-          out[11] = _mm256_packs_epi32(u[12], u[13]);
-          out[27] = _mm256_packs_epi32(u[14], u[15]);
-        }
-      }
-#endif
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output_currStep, *output_nextStep;
-        tran_low_t *curr_out, *next_out;
-        // Pass 0
-        output_currStep = &intermediate[column_start * 32];
-        output_nextStep = &intermediate[(column_start + 8) * 32];
-        // Pass 1
-        curr_out = &output_org[column_start * 32];
-        next_out = &output_org[(column_start + 8) * 32];
-
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m256i *this_out = &out[8 * transpose_block];
-          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
-          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
-          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
-          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
-          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
-          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
-          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
-          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
-          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
-          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
-          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
-          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
-          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  101
-          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
-          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
-          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
-
-          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
-          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
-          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
-          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
-          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
-          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
-          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
-          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
-          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
-          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
-          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
-          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
-          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
-          // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151
-          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
-          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
-          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
-          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
-          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
-          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
-          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
-          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
-          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
-          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
-          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
-          // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151
-          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
-          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
-          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
-          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
-            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
-            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
-            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
-            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
-            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
-            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
-            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/av1_dct.c
-            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
-          }
-          if (0 == pass) {
-            // Note: even though all these stores are aligned, using the aligned
-            //       intrinsic make the code slightly slower.
-            _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
-                             _mm256_castsi256_si128(tr2_0));
-            _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
-                             _mm256_castsi256_si128(tr2_1));
-            _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
-                             _mm256_castsi256_si128(tr2_2));
-            _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
-                             _mm256_castsi256_si128(tr2_3));
-            _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
-                             _mm256_castsi256_si128(tr2_4));
-            _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
-                             _mm256_castsi256_si128(tr2_5));
-            _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
-                             _mm256_castsi256_si128(tr2_6));
-            _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
-                             _mm256_castsi256_si128(tr2_7));
-
-            _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
-                             _mm256_extractf128_si256(tr2_0, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
-                             _mm256_extractf128_si256(tr2_1, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
-                             _mm256_extractf128_si256(tr2_2, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
-                             _mm256_extractf128_si256(tr2_3, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
-                             _mm256_extractf128_si256(tr2_4, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
-                             _mm256_extractf128_si256(tr2_5, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
-                             _mm256_extractf128_si256(tr2_6, 1));
-            _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
-                             _mm256_extractf128_si256(tr2_7, 1));
-            // Process next 8x8
-            output_currStep += 8;
-            output_nextStep += 8;
-          }
-          if (1 == pass) {
-            store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
-            store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
-            store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
-            store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
-            store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
-            store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
-            store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
-            store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
-            curr_out += 8;
-            next_out += 8;
-          }
-        }
-      }
-    }
-  }
-  _mm256_zeroupper();
-}  // NOLINT
diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 69dd6af11..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3201 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "aom_dsp/fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also causes cross reference to the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c
-#else
-void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    aom_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c
-#endif  // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif  // DCT_HIGH_BIT_DEPTH
-
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 8) {
-      __m128i step1[32];
-      __m128i step2[32];
-      __m128i step3[32];
-      __m128i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic make the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m128i *step1a = &step1[0];
-          __m128i *step1b = &step1[31];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m128i *step1a = &step1[4];
-          __m128i *step1b = &step1[27];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m128i *step1a = &step1[8];
-          __m128i *step1b = &step1[23];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m128i *step1a = &step1[12];
-          __m128i *step1b = &step1[19];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
-          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
-          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
-          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
-          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
-          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
-          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
-          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
-          step1[0] = ADD_EPI16(in00, in31);
-          step1[1] = ADD_EPI16(in01, in30);
-          step1[2] = ADD_EPI16(in02, in29);
-          step1[3] = ADD_EPI16(in03, in28);
-          step1[28] = SUB_EPI16(in03, in28);
-          step1[29] = SUB_EPI16(in02, in29);
-          step1[30] = SUB_EPI16(in01, in30);
-          step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
-                                             &step1[3], &step1[28], &step1[29],
-                                             &step1[30], &step1[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
-          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
-          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
-          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
-          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
-          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
-          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
-          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
-          step1[4] = ADD_EPI16(in04, in27);
-          step1[5] = ADD_EPI16(in05, in26);
-          step1[6] = ADD_EPI16(in06, in25);
-          step1[7] = ADD_EPI16(in07, in24);
-          step1[24] = SUB_EPI16(in07, in24);
-          step1[25] = SUB_EPI16(in06, in25);
-          step1[26] = SUB_EPI16(in05, in26);
-          step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
-                                             &step1[7], &step1[24], &step1[25],
-                                             &step1[26], &step1[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
-          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
-          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
-          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
-          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
-          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
-          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
-          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
-          step1[8] = ADD_EPI16(in08, in23);
-          step1[9] = ADD_EPI16(in09, in22);
-          step1[10] = ADD_EPI16(in10, in21);
-          step1[11] = ADD_EPI16(in11, in20);
-          step1[20] = SUB_EPI16(in11, in20);
-          step1[21] = SUB_EPI16(in10, in21);
-          step1[22] = SUB_EPI16(in09, in22);
-          step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
-                                             &step1[11], &step1[20], &step1[21],
-                                             &step1[22], &step1[23]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
-          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
-          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
-          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
-          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
-          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
-          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
-          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
-          step1[12] = ADD_EPI16(in12, in19);
-          step1[13] = ADD_EPI16(in13, in18);
-          step1[14] = ADD_EPI16(in14, in17);
-          step1[15] = ADD_EPI16(in15, in16);
-          step1[16] = SUB_EPI16(in15, in16);
-          step1[17] = SUB_EPI16(in14, in17);
-          step1[18] = SUB_EPI16(in13, in18);
-          step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
-                                             &step1[15], &step1[16], &step1[17],
-                                             &step1[18], &step1[19]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = ADD_EPI16(step1[0], step1[15]);
-        step2[1] = ADD_EPI16(step1[1], step1[14]);
-        step2[2] = ADD_EPI16(step1[2], step1[13]);
-        step2[3] = ADD_EPI16(step1[3], step1[12]);
-        step2[4] = ADD_EPI16(step1[4], step1[11]);
-        step2[5] = ADD_EPI16(step1[5], step1[10]);
-        step2[6] = ADD_EPI16(step1[6], step1[9]);
-        step2[7] = ADD_EPI16(step1[7], step1[8]);
-        step2[8] = SUB_EPI16(step1[7], step1[8]);
-        step2[9] = SUB_EPI16(step1[6], step1[9]);
-        step2[10] = SUB_EPI16(step1[5], step1[10]);
-        step2[11] = SUB_EPI16(step1[4], step1[11]);
-        step2[12] = SUB_EPI16(step1[3], step1[12]);
-        step2[13] = SUB_EPI16(step1[2], step1[13]);
-        step2[14] = SUB_EPI16(step1[1], step1[14]);
-        step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x16(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      {
-        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
-        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
-                                           &step2[23], &step2[24], &step2[25],
-                                           &step2[26], &step2[27]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // dump the magnitude by half, hence the intermediate values are within
-      // the range of 16 bits.
-      if (1 == pass) {
-        __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
-        __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
-        __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
-        __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
-        __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
-        __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
-        __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
-        __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
-        __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
-        __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
-        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
-        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
-        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
-        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
-        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
-        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
-        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
-        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
-        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
-        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
-        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
-        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
-        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
-        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
-        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
-        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
-        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
-        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
-        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
-        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
-        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
-        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
-        step2[0] = SUB_EPI16(step2[0], s3_00_0);
-        step2[1] = SUB_EPI16(step2[1], s3_01_0);
-        step2[2] = SUB_EPI16(step2[2], s3_02_0);
-        step2[3] = SUB_EPI16(step2[3], s3_03_0);
-        step2[4] = SUB_EPI16(step2[4], s3_04_0);
-        step2[5] = SUB_EPI16(step2[5], s3_05_0);
-        step2[6] = SUB_EPI16(step2[6], s3_06_0);
-        step2[7] = SUB_EPI16(step2[7], s3_07_0);
-        step2[8] = SUB_EPI16(step2[8], s2_08_0);
-        step2[9] = SUB_EPI16(step2[9], s2_09_0);
-        step2[10] = SUB_EPI16(step2[10], s3_10_0);
-        step2[11] = SUB_EPI16(step2[11], s3_11_0);
-        step2[12] = SUB_EPI16(step2[12], s3_12_0);
-        step2[13] = SUB_EPI16(step2[13], s3_13_0);
-        step2[14] = SUB_EPI16(step2[14], s2_14_0);
-        step2[15] = SUB_EPI16(step2[15], s2_15_0);
-        step1[16] = SUB_EPI16(step1[16], s3_16_0);
-        step1[17] = SUB_EPI16(step1[17], s3_17_0);
-        step1[18] = SUB_EPI16(step1[18], s3_18_0);
-        step1[19] = SUB_EPI16(step1[19], s3_19_0);
-        step2[20] = SUB_EPI16(step2[20], s3_20_0);
-        step2[21] = SUB_EPI16(step2[21], s3_21_0);
-        step2[22] = SUB_EPI16(step2[22], s3_22_0);
-        step2[23] = SUB_EPI16(step2[23], s3_23_0);
-        step2[24] = SUB_EPI16(step2[24], s3_24_0);
-        step2[25] = SUB_EPI16(step2[25], s3_25_0);
-        step2[26] = SUB_EPI16(step2[26], s3_26_0);
-        step2[27] = SUB_EPI16(step2[27], s3_27_0);
-        step1[28] = SUB_EPI16(step1[28], s3_28_0);
-        step1[29] = SUB_EPI16(step1[29], s3_29_0);
-        step1[30] = SUB_EPI16(step1[30], s3_30_0);
-        step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x32(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
-            &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
-            &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
-            &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
-        if (overflow) {
-          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        step2[0] = _mm_add_epi16(step2[0], kOne);
-        step2[1] = _mm_add_epi16(step2[1], kOne);
-        step2[2] = _mm_add_epi16(step2[2], kOne);
-        step2[3] = _mm_add_epi16(step2[3], kOne);
-        step2[4] = _mm_add_epi16(step2[4], kOne);
-        step2[5] = _mm_add_epi16(step2[5], kOne);
-        step2[6] = _mm_add_epi16(step2[6], kOne);
-        step2[7] = _mm_add_epi16(step2[7], kOne);
-        step2[8] = _mm_add_epi16(step2[8], kOne);
-        step2[9] = _mm_add_epi16(step2[9], kOne);
-        step2[10] = _mm_add_epi16(step2[10], kOne);
-        step2[11] = _mm_add_epi16(step2[11], kOne);
-        step2[12] = _mm_add_epi16(step2[12], kOne);
-        step2[13] = _mm_add_epi16(step2[13], kOne);
-        step2[14] = _mm_add_epi16(step2[14], kOne);
-        step2[15] = _mm_add_epi16(step2[15], kOne);
-        step1[16] = _mm_add_epi16(step1[16], kOne);
-        step1[17] = _mm_add_epi16(step1[17], kOne);
-        step1[18] = _mm_add_epi16(step1[18], kOne);
-        step1[19] = _mm_add_epi16(step1[19], kOne);
-        step2[20] = _mm_add_epi16(step2[20], kOne);
-        step2[21] = _mm_add_epi16(step2[21], kOne);
-        step2[22] = _mm_add_epi16(step2[22], kOne);
-        step2[23] = _mm_add_epi16(step2[23], kOne);
-        step2[24] = _mm_add_epi16(step2[24], kOne);
-        step2[25] = _mm_add_epi16(step2[25], kOne);
-        step2[26] = _mm_add_epi16(step2[26], kOne);
-        step2[27] = _mm_add_epi16(step2[27], kOne);
-        step1[28] = _mm_add_epi16(step1[28], kOne);
-        step1[29] = _mm_add_epi16(step1[29], kOne);
-        step1[30] = _mm_add_epi16(step1[30], kOne);
-        step1[31] = _mm_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm_srai_epi16(step2[0], 2);
-        step2[1] = _mm_srai_epi16(step2[1], 2);
-        step2[2] = _mm_srai_epi16(step2[2], 2);
-        step2[3] = _mm_srai_epi16(step2[3], 2);
-        step2[4] = _mm_srai_epi16(step2[4], 2);
-        step2[5] = _mm_srai_epi16(step2[5], 2);
-        step2[6] = _mm_srai_epi16(step2[6], 2);
-        step2[7] = _mm_srai_epi16(step2[7], 2);
-        step2[8] = _mm_srai_epi16(step2[8], 2);
-        step2[9] = _mm_srai_epi16(step2[9], 2);
-        step2[10] = _mm_srai_epi16(step2[10], 2);
-        step2[11] = _mm_srai_epi16(step2[11], 2);
-        step2[12] = _mm_srai_epi16(step2[12], 2);
-        step2[13] = _mm_srai_epi16(step2[13], 2);
-        step2[14] = _mm_srai_epi16(step2[14], 2);
-        step2[15] = _mm_srai_epi16(step2[15], 2);
-        step1[16] = _mm_srai_epi16(step1[16], 2);
-        step1[17] = _mm_srai_epi16(step1[17], 2);
-        step1[18] = _mm_srai_epi16(step1[18], 2);
-        step1[19] = _mm_srai_epi16(step1[19], 2);
-        step2[20] = _mm_srai_epi16(step2[20], 2);
-        step2[21] = _mm_srai_epi16(step2[21], 2);
-        step2[22] = _mm_srai_epi16(step2[22], 2);
-        step2[23] = _mm_srai_epi16(step2[23], 2);
-        step2[24] = _mm_srai_epi16(step2[24], 2);
-        step2[25] = _mm_srai_epi16(step2[25], 2);
-        step2[26] = _mm_srai_epi16(step2[26], 2);
-        step2[27] = _mm_srai_epi16(step2[27], 2);
-        step1[28] = _mm_srai_epi16(step1[28], 2);
-        step1[29] = _mm_srai_epi16(step1[29], 2);
-        step1[30] = _mm_srai_epi16(step1[30], 2);
-        step1[31] = _mm_srai_epi16(step1[31], 2);
-      }
-#endif  // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
-          step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
-          step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
-          step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
-          step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
-          step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
-          step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
-          step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
-          step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
-                                             &step3[3], &step3[4], &step3[5],
-                                             &step3[6], &step3[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
-                                             &step3[13]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[16] = ADD_EPI16(step2[23], step1[16]);
-          step3[17] = ADD_EPI16(step2[22], step1[17]);
-          step3[18] = ADD_EPI16(step2[21], step1[18]);
-          step3[19] = ADD_EPI16(step2[20], step1[19]);
-          step3[20] = SUB_EPI16(step1[19], step2[20]);
-          step3[21] = SUB_EPI16(step1[18], step2[21]);
-          step3[22] = SUB_EPI16(step1[17], step2[22]);
-          step3[23] = SUB_EPI16(step1[16], step2[23]);
-          step3[24] = SUB_EPI16(step1[31], step2[24]);
-          step3[25] = SUB_EPI16(step1[30], step2[25]);
-          step3[26] = SUB_EPI16(step1[29], step2[26]);
-          step3[27] = SUB_EPI16(step1[28], step2[27]);
-          step3[28] = ADD_EPI16(step2[27], step1[28]);
-          step3[29] = ADD_EPI16(step2[26], step1[29]);
-          step3[30] = ADD_EPI16(step2[25], step1[30]);
-          step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
-              &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
-              &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
-              &step3[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-
-        // Stage 4
-        {
-          step1[0] = ADD_EPI16(step3[3], step3[0]);
-          step1[1] = ADD_EPI16(step3[2], step3[1]);
-          step1[2] = SUB_EPI16(step3[1], step3[2]);
-          step1[3] = SUB_EPI16(step3[0], step3[3]);
-          step1[8] = ADD_EPI16(step3[11], step2[8]);
-          step1[9] = ADD_EPI16(step3[10], step2[9]);
-          step1[10] = SUB_EPI16(step2[9], step3[10]);
-          step1[11] = SUB_EPI16(step2[8], step3[11]);
-          step1[12] = SUB_EPI16(step2[15], step3[12]);
-          step1[13] = SUB_EPI16(step2[14], step3[13]);
-          step1[14] = ADD_EPI16(step3[13], step2[14]);
-          step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
-              &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
-              &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-          const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-          const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-          const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-          const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-          const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-          const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-          const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-          const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-          const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-          const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
-                                             &step1[21], &step1[26], &step1[27],
-                                             &step1[28], &step1[29]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 5
-        {
-          step2[4] = ADD_EPI16(step1[5], step3[4]);
-          step2[5] = SUB_EPI16(step3[4], step1[5]);
-          step2[6] = SUB_EPI16(step3[7], step1[6]);
-          step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
-                                             &step2[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-          const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-          const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-          const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-          const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i out_00_4 =
-              _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_5 =
-              _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_4 =
-              _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_5 =
-              _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_4 =
-              _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_5 =
-              _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_4 =
-              _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_5 =
-              _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-          const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-          const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-          const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-          const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
-                                             &step2[14]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step2[16] = ADD_EPI16(step1[19], step3[16]);
-          step2[17] = ADD_EPI16(step1[18], step3[17]);
-          step2[18] = SUB_EPI16(step3[17], step1[18]);
-          step2[19] = SUB_EPI16(step3[16], step1[19]);
-          step2[20] = SUB_EPI16(step3[23], step1[20]);
-          step2[21] = SUB_EPI16(step3[22], step1[21]);
-          step2[22] = ADD_EPI16(step1[21], step3[22]);
-          step2[23] = ADD_EPI16(step1[20], step3[23]);
-          step2[24] = ADD_EPI16(step1[27], step3[24]);
-          step2[25] = ADD_EPI16(step1[26], step3[25]);
-          step2[26] = SUB_EPI16(step3[25], step1[26]);
-          step2[27] = SUB_EPI16(step3[24], step1[27]);
-          step2[28] = SUB_EPI16(step3[31], step1[28]);
-          step2[29] = SUB_EPI16(step3[30], step1[29]);
-          step2[30] = ADD_EPI16(step1[29], step3[30]);
-          step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
-              &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
-              &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
-              &step2[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 6
-        {
-          const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m128i out_04_4 =
-              _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_5 =
-              _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_4 =
-              _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_5 =
-              _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_4 =
-              _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_5 =
-              _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_4 =
-              _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_5 =
-              _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[8] = ADD_EPI16(step2[9], step1[8]);
-          step3[9] = SUB_EPI16(step1[8], step2[9]);
-          step3[10] = SUB_EPI16(step1[11], step2[10]);
-          step3[11] = ADD_EPI16(step2[10], step1[11]);
-          step3[12] = ADD_EPI16(step2[13], step1[12]);
-          step3[13] = SUB_EPI16(step1[12], step2[13]);
-          step3[14] = SUB_EPI16(step1[15], step2[14]);
-          step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
-                                             &step3[11], &step3[12], &step3[13],
-                                             &step3[14], &step3[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-          const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-          const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-          const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-          const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-          const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-          const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-          const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-          const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-          // Combine
-          step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
-                                             &step3[22], &step3[25], &step3[26],
-                                             &step3[29], &step3[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 7
-        {
-          const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-          const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-          const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-          const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-          const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-          const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-          const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-          const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-          const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m128i out_02_4 =
-              _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_5 =
-              _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_4 =
-              _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_5 =
-              _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_4 =
-              _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_5 =
-              _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_4 =
-              _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_5 =
-              _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_4 =
-              _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_5 =
-              _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_4 =
-              _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_5 =
-              _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_4 =
-              _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_5 =
-              _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_4 =
-              _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_5 =
-              _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step1[16] = ADD_EPI16(step3[17], step2[16]);
-          step1[17] = SUB_EPI16(step2[16], step3[17]);
-          step1[18] = SUB_EPI16(step2[19], step3[18]);
-          step1[19] = ADD_EPI16(step3[18], step2[19]);
-          step1[20] = ADD_EPI16(step3[21], step2[20]);
-          step1[21] = SUB_EPI16(step2[20], step3[21]);
-          step1[22] = SUB_EPI16(step2[23], step3[22]);
-          step1[23] = ADD_EPI16(step3[22], step2[23]);
-          step1[24] = ADD_EPI16(step3[25], step2[24]);
-          step1[25] = SUB_EPI16(step2[24], step3[25]);
-          step1[26] = SUB_EPI16(step2[27], step3[26]);
-          step1[27] = ADD_EPI16(step3[26], step2[27]);
-          step1[28] = ADD_EPI16(step3[29], step2[28]);
-          step1[29] = SUB_EPI16(step2[28], step3[29]);
-          step1[30] = SUB_EPI16(step2[31], step3[30]);
-          step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
-              &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
-              &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
-              &step1[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Final stage --- outputs indices are bit-reversed.
-        {
-          const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-          const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-          const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-          const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-          const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-          const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-          const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-          const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-          const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m128i out_01_4 =
-              _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_5 =
-              _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_4 =
-              _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_5 =
-              _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_4 =
-              _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_5 =
-              _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_4 =
-              _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_5 =
-              _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_4 =
-              _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_5 =
-              _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_4 =
-              _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_5 =
-              _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_4 =
-              _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_5 =
-              _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_4 =
-              _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_5 =
-              _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-          const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-          const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-          const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-          const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-          const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-          const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-          const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-          const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m128i out_05_4 =
-              _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_5 =
-              _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_4 =
-              _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_5 =
-              _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_4 =
-              _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_5 =
-              _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_4 =
-              _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_5 =
-              _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_4 =
-              _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_5 =
-              _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_4 =
-              _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_5 =
-              _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_4 =
-              _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_5 =
-              _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_4 =
-              _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_5 =
-              _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m128i lstep1[64], lstep2[64], lstep3[64];
-        __m128i u[32], v[32], sign[16];
-        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
-        // start using 32-bit operations
-        // stage 3
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
-                                              &v[5], &v[6], &v[7], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm_packs_epi32(u[0], u[1]);
-          out[16] = _mm_packs_epi32(u[2], u[3]);
-          out[8] = _mm_packs_epi32(u[4], u[5]);
-          out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          out[4] = _mm_packs_epi32(u[0], u[1]);
-          out[20] = _mm_packs_epi32(u[2], u[3]);
-          out[12] = _mm_packs_epi32(u[4], u[5]);
-          out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m12_m20 =
-              pair_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 7
-        {
-          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
-          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
-          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
-          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
-          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[2] = _mm_packs_epi32(u[0], u[1]);
-          out[18] = _mm_packs_epi32(u[2], u[3]);
-          out[10] = _mm_packs_epi32(u[4], u[5]);
-          out[26] = _mm_packs_epi32(u[6], u[7]);
-          out[6] = _mm_packs_epi32(u[8], u[9]);
-          out[22] = _mm_packs_epi32(u[10], u[11]);
-          out[14] = _mm_packs_epi32(u[12], u[13]);
-          out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
-          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
-          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
-          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
-          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[1] = _mm_packs_epi32(u[0], u[1]);
-          out[17] = _mm_packs_epi32(u[2], u[3]);
-          out[9] = _mm_packs_epi32(u[4], u[5]);
-          out[25] = _mm_packs_epi32(u[6], u[7]);
-          out[7] = _mm_packs_epi32(u[8], u[9]);
-          out[23] = _mm_packs_epi32(u[10], u[11]);
-          out[15] = _mm_packs_epi32(u[12], u[13]);
-          out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
-          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
-          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
-          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
-          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[5] = _mm_packs_epi32(u[0], u[1]);
-          out[21] = _mm_packs_epi32(u[2], u[3]);
-          out[13] = _mm_packs_epi32(u[4], u[5]);
-          out[29] = _mm_packs_epi32(u[6], u[7]);
-          out[3] = _mm_packs_epi32(u[8], u[9]);
-          out[19] = _mm_packs_epi32(u[10], u[11]);
-          out[11] = _mm_packs_epi32(u[12], u[13]);
-          out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-#endif  // FDCT32x32_HIGH_PRECISION
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output0 = &intermediate[column_start * 32];
-        tran_low_t *output1 = &output_org[column_start * 32];
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m128i *this_out = &out[8 * transpose_block];
-          // 00 01 02 03 04 05 06 07
-          // 10 11 12 13 14 15 16 17
-          // 20 21 22 23 24 25 26 27
-          // 30 31 32 33 34 35 36 37
-          // 40 41 42 43 44 45 46 47
-          // 50 51 52 53 54 55 56 57
-          // 60 61 62 63 64 65 66 67
-          // 70 71 72 73 74 75 76 77
-          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00 10 01 11 02 12 03 13
-          // 20 30 21 31 22 32 23 33
-          // 04 14 05 15 06 16 07 17
-          // 24 34 25 35 26 36 27 37
-          // 40 50 41 51 42 52 43 53
-          // 60 70 61 71 62 72 63 73
-          // 54 54 55 55 56 56 57 57
-          // 64 74 65 75 66 76 67 77
-          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 10 20 30 01 11 21 31
-          // 40 50 60 70 41 51 61 71
-          // 02 12 22 32 03 13 23 33
-          // 42 52 62 72 43 53 63 73
-          // 04 14 24 34 05 15 21 36
-          // 44 54 64 74 45 55 61 76
-          // 06 16 26 36 07 17 27 37
-          // 46 56 66 76 47 57 67 77
-          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 10 20 30 40 50 60 70
-          // 01 11 21 31 41 51 61 71
-          // 02 12 22 32 42 52 62 72
-          // 03 13 23 33 43 53 63 73
-          // 04 14 24 34 44 54 64 74
-          // 05 15 25 35 45 55 65 75
-          // 06 16 26 36 46 56 66 76
-          // 07 17 27 37 47 57 67 77
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
-            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
-            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
-            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
-            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
-            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
-            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
-            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/av1_dct.c
-            tr2_0 = _mm_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm_srai_epi16(tr2_7, 2);
-          }
-          // Note: even though all these stores are aligned, using the aligned
-          //       intrinsic make the code slightly slower.
-          if (pass == 0) {
-            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
-            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
-            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
-            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
-            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
-            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
-            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
-            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
-            // Process next 8x8
-            output0 += 8;
-          } else {
-            storeu_output(&tr2_0, (output1 + 0 * 32));
-            storeu_output(&tr2_1, (output1 + 1 * 32));
-            storeu_output(&tr2_2, (output1 + 2 * 32));
-            storeu_output(&tr2_3, (output1 + 3 * 32));
-            storeu_output(&tr2_4, (output1 + 4 * 32));
-            storeu_output(&tr2_5, (output1 + 5 * 32));
-            storeu_output(&tr2_6, (output1 + 6 * 32));
-            storeu_output(&tr2_7, (output1 + 7 * 32));
-            // Process next 8x8
-            output1 += 8;
-          }
-        }
-      }
-    }
-  }
-}  // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
deleted file mode 100644
index 670f864d0..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_config.h"
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
-#undef FDCT32x32_2D_AVX2
-#undef FDCT32x32_HIGH_PRECISION
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
deleted file mode 100644
index 86df4a6f6..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
-#define AOM_DSP_X86_FWD_TXFM_AVX2_H
-
-#include "./aom_config.h"
-
-static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
-
-    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
-
-    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
-
-    _mm256_storeu_si256((__m256i *)out, y0);
-    _mm256_storeu_si256((__m256i *)(out + 8), y1);
-  } else {
-    _mm256_storeu_si256((__m256i *)out, *coeff);
-  }
-}
-
-#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 7bb1db70a..1e3d13ec8 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -11,7 +11,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 #include "aom_dsp/x86/txfm_common_sse2.h"
@@ -29,233 +30,6 @@
 #define SUB_EPI16 _mm_sub_epi16
 #endif
 
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // This 2D transform implements 4 vertical 1D transforms followed
-  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
-  // by Chen, Smith and Fralick ('77).  The commands for moving the data
-  // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to at i0
-  // through iF (in raster order), intermediate variables are a0, b0, c0
-  // through f, and correspond to the in-place computations mapped to input
-  // locations.  The outputs, o0 through oF are labeled according to the
-  // output locations.
-
-  // Constants
-  // These are the coefficients used for the multiplies.
-  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
-  // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // This second rounding constant saves doing some extra adds at the end
-  const __m128i k__DCT_CONST_ROUNDING2 =
-      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
-  __m128i cmp0, cmp1;
-  int test, overflow;
-#endif
-
-  // Load inputs.
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  // in0 = [i0 i1 i2 i3 iC iD iE iF]
-  // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-#if DCT_HIGH_BIT_DEPTH
-  // Check inputs small enough to use optimised code
-  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
-  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
-  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
-  if (test) {
-    aom_highbd_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-  // multiply by 16 to give some extra precision
-  in0 = _mm_slli_epi16(in0, 4);
-  in1 = _mm_slli_epi16(in1, 4);
-  // if (i == 0 && input[0]) input[0] += 1;
-  // add 1 to the upper left pixel if it is non-zero, which helps reduce
-  // the round-trip error
-  {
-    // The mask will only contain whether the first value is zero, all
-    // other comparison will fail as something shifted by 4 (above << 4)
-    // can never be equal to one. To increment in the non-zero case, we
-    // add the mask and one for the first element:
-    //   - if zero, mask = -1, v = v - 1 + 1 = v
-    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
-    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
-    in0 = _mm_add_epi16(in0, mask);
-    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
-  }
-  // There are 4 total stages, alternating between an add/subtract stage
-  // followed by an multiply-and-add stage.
-  {
-    // Stage 1: Add/subtract
-
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
-    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
-    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
-    // r1 = [iC i8 iD i9 iE iA iF iB]
-    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
-    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
-    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
-    // r3 = [iC i8 iD i9 iF iB iE iA]
-
-    const __m128i t0 = _mm_add_epi16(r2, r3);
-    const __m128i t1 = _mm_sub_epi16(r2, r3);
-    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
-    // t1 = [aC a8 aD a9 aF aB aE aA]
-
-    // Stage 2: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
-    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
-    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
-    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
-    // Then add and right-shift to get back to 16-bit range
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    // w0 = [b0 b1 b7 b6]
-    // w1 = [b8 b9 bF bE]
-    // w2 = [b4 b5 b3 b2]
-    // w3 = [bC bD bB bA]
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&x0, &x1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
-    // x1 = [b4 b5 b3 b2 bC bD bB bA]
-    in0 = _mm_shuffle_epi32(x0, 0xD8);
-    in1 = _mm_shuffle_epi32(x1, 0x8D);
-    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
-    // in1 = [b3 b2 bB bA b4 b5 bC bD]
-  }
-  {
-    // vertical DCTs finished. Now we do the horizontal DCTs.
-    // Stage 3: Add/subtract
-
-    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
-    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-    const __m128i t0 = ADD_EPI16(in0, in1);
-    const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&t0, &t1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-    // Stage 4: multiply by constants (which gets us into 32 bits).
-    {
-      // The constants needed here are:
-      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
-      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
-      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
-      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
-      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
-      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
-      // Then add and right-shift to get back to 16-bit range
-      // but this combines the final right-shift as well to save operations
-      // This unusual rounding operations is to maintain bit-accurate
-      // compatibility with the c version of this function which has two
-      // rounding steps in a row.
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
-      // w0 = [o0 o4 o8 oC]
-      // w1 = [o2 o6 oA oE]
-      // w2 = [o1 o5 o9 oD]
-      // w3 = [o3 o7 oB oF]
-      // remember the o's are numbered according to the correct output location
-      const __m128i x0 = _mm_packs_epi32(w0, w1);
-      const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&x0, &x1);
-      if (overflow) {
-        aom_highbd_fdct4x4_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
-        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
-        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
-        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
-        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
-        // y1 = [o2 o3 o6 o7 oA oB oE oF]
-        in0 = _mm_unpacklo_epi32(y0, y1);
-        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
-        in1 = _mm_unpackhi_epi32(y0, y1);
-        // in1 = [o8 o9 oA oB oC oD oE oF]
-      }
-    }
-  }
-  // Post-condition (v + 1) >> 2 is now incorporated into previous
-  // add and right-shift commands.  Only 2 store instructions needed
-  // because we are using the fact that 1/3 are stored just after 0/2.
-  storeu_output(&in0, output + 0 * 4);
-  storeu_output(&in1, output + 2 * 4);
-}
-
 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   int pass;
   // Constants
@@ -566,449 +340,5 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   }
 }
 
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
-  const int16_t *in = input;
-  int16_t *out0 = intermediate;
-  tran_low_t *out1 = output;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-#if DCT_HIGH_BIT_DEPTH
-    int overflow;
-#endif
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = ADD_EPI16(in00, in15);
-        input1 = ADD_EPI16(in01, in14);
-        input2 = ADD_EPI16(in02, in13);
-        input3 = ADD_EPI16(in03, in12);
-        input4 = ADD_EPI16(in04, in11);
-        input5 = ADD_EPI16(in05, in10);
-        input6 = ADD_EPI16(in06, in09);
-        input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
-                                           &input4, &input5, &input6, &input7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = SUB_EPI16(in07, in08);
-        step1_1 = SUB_EPI16(in06, in09);
-        step1_2 = SUB_EPI16(in05, in10);
-        step1_3 = SUB_EPI16(in04, in11);
-        step1_4 = SUB_EPI16(in03, in12);
-        step1_5 = SUB_EPI16(in02, in13);
-        step1_6 = SUB_EPI16(in01, in14);
-        step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                    &step1_4, &step1_5, &step1_6, &step1_7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = ADD_EPI16(input0, input7);
-        const __m128i q1 = ADD_EPI16(input1, input6);
-        const __m128i q2 = ADD_EPI16(input2, input5);
-        const __m128i q3 = ADD_EPI16(input3, input4);
-        const __m128i q4 = SUB_EPI16(input3, input4);
-        const __m128i q5 = SUB_EPI16(input2, input5);
-        const __m128i q6 = SUB_EPI16(input1, input6);
-        const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = ADD_EPI16(q0, q3);
-          const __m128i r1 = ADD_EPI16(q1, q2);
-          const __m128i r2 = SUB_EPI16(q1, q2);
-          const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          {
-            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-          }
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i r0 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          const __m128i r1 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&r0, &r1);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          {
-            // Add/subtract
-            const __m128i x0 = ADD_EPI16(q4, r0);
-            const __m128i x1 = SUB_EPI16(q4, r0);
-            const __m128i x2 = SUB_EPI16(q7, r1);
-            const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-            // Interleave to do the multiply by constants which gets us
-            // into 32 bits.
-            {
-              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-              overflow =
-                  check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
-              if (overflow) {
-                aom_highbd_fdct16x16_c(input, output, stride);
-                return;
-              }
-#endif  // DCT_HIGH_BIT_DEPTH
-            }
-          }
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 3
-        {
-          step3_0 = ADD_EPI16(step1_0, step2_3);
-          step3_1 = ADD_EPI16(step1_1, step2_2);
-          step3_2 = SUB_EPI16(step1_1, step2_2);
-          step3_3 = SUB_EPI16(step1_0, step2_3);
-          step3_4 = SUB_EPI16(step1_7, step2_4);
-          step3_5 = SUB_EPI16(step1_6, step2_5);
-          step3_6 = ADD_EPI16(step1_6, step2_5);
-          step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
-                                      &step3_4, &step3_5, &step3_6, &step3_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 5
-        {
-          step1_0 = ADD_EPI16(step3_0, step2_1);
-          step1_1 = SUB_EPI16(step3_0, step2_1);
-          step1_2 = ADD_EPI16(step3_3, step2_2);
-          step1_3 = SUB_EPI16(step3_3, step2_2);
-          step1_4 = SUB_EPI16(step3_4, step2_5);
-          step1_5 = ADD_EPI16(step3_4, step2_5);
-          step1_6 = SUB_EPI16(step3_7, step2_6);
-          step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                      &step1_4, &step1_5, &step1_6, &step1_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
-                              &res06, &res07, pass, out0, out1);
-      transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
-                              &res14, &res15, pass, out0 + 8, out1 + 8);
-      if (pass == 0) {
-        out0 += 8 * 16;
-      } else {
-        out1 += 8 * 16;
-      }
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-  }
-}
-
 #undef ADD_EPI16
 #undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
index 657dcfa22..2d8f8f71e 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -11,40 +11,12 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/x86/fwd_txfm_sse2.h"
 
-void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0, in1;
-  __m128i tmp;
-  const __m128i zero = _mm_setzero_si128();
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
-  tmp = _mm_add_epi16(in0, in1);
-  in0 = _mm_unpacklo_epi16(zero, tmp);
-  in1 = _mm_unpackhi_epi16(zero, tmp);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(tmp, zero);
-  in1 = _mm_unpackhi_epi32(tmp, zero);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(tmp, 8);
-
-  in1 = _mm_add_epi32(tmp, in0);
-  in0 = _mm_slli_epi32(in1, 1);
-  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
-}
-
 void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
@@ -86,47 +58,12 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
 }
 
 #define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D aom_fdct4x4_sse2
 #define FDCT8x8_2D aom_fdct8x8_sse2
-#define FDCT16x16_2D aom_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
 
-#define FDCT32x32_2D aom_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
 #undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_HIGHBITDEPTH
 #define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D aom_highbd_fdct4x4_sse2
 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
index 58e8971dd..12ccf7f26 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -12,15 +12,10 @@
 #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
 #define AOM_DSP_X86_FWD_TXFM_SSE2_H_
 
-#include "aom_dsp/x86/txfm_common_intrin.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define pair_set_epi32(a, b) \
-  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
-
 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
   __m128i buf0, buf1;
   buf0 = _mm_mul_epu32(a, b);
@@ -140,112 +135,6 @@ static INLINE int check_epi16_overflow_x32(
   return res0 + res1;
 }
 
-static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
-                                           const __m128i *preg1,
-                                           const __m128i *preg2,
-                                           const __m128i *preg3,
-                                           const __m128i *zero) {
-  __m128i minus_one = _mm_set1_epi32(-1);
-  // Check for overflows
-  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
-  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
-  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
-  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
-  __m128i reg0_top_dwords =
-      _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg1_top_dwords =
-      _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg2_top_dwords =
-      _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i reg3_top_dwords =
-      _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
-  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
-  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
-  __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
-  __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
-  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
-  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
-  int overflow_01 =
-      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
-  int overflow_23 =
-      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
-  return (overflow_01 + overflow_23);
-}
-
-static INLINE int k_check_epi32_overflow_8(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-  }
-  return overflow;
-}
-
-static INLINE int k_check_epi32_overflow_16(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-    if (!overflow) {
-      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
-      if (!overflow) {
-        overflow =
-            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
-      }
-    }
-  }
-  return overflow;
-}
-
-static INLINE int k_check_epi32_overflow_32(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
-    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
-    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
-    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
-    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
-    const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
-  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
-  if (!overflow) {
-    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
-    if (!overflow) {
-      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
-      if (!overflow) {
-        overflow =
-            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
-        if (!overflow) {
-          overflow =
-              k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
-          if (!overflow) {
-            overflow =
-                k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
-            if (!overflow) {
-              overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
-                                                  preg27, zero);
-              if (!overflow) {
-                overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
-                                                    preg31, zero);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  return overflow;
-}
-
 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
   if (sizeof(tran_low_t) == 4) {
     const __m128i zero = _mm_setzero_si128();
@@ -259,102 +148,6 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
   }
 }
 
-static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
-                                       const __m128i *pmultiplier,
-                                       const __m128i *prounding, int shift) {
-  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
-  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
-  const __m128i v0 = _mm_add_epi32(u0, *prounding);
-  const __m128i v1 = _mm_add_epi32(u1, *prounding);
-  const __m128i w0 = _mm_srai_epi32(v0, shift);
-  const __m128i w1 = _mm_srai_epi32(v1, shift);
-  return _mm_packs_epi32(w0, w1);
-}
-
-static INLINE void transpose_and_output8x8(
-    const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
-    const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
-    const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr,
-    tran_low_t *out1_ptr) {
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  // 40 41 42 43 44 45 46 47
-  // 50 51 52 53 54 55 56 57
-  // 60 61 62 63 64 65 66 67
-  // 70 71 72 73 74 75 76 77
-  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 54 54 55 55 56 56 57 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 21 36
-  // 44 54 64 74 45 55 61 76
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-  if (pass == 0) {
-    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
-    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
-  } else {
-    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
-    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
-    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
-    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
-    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
-    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
-    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
-    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
-  }
-}
-
-void fdct32_8col(__m128i *in0, __m128i *in1);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 8fa1c04d0..c1fb259a1 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -13,10 +13,6 @@
 
 %include "third_party/x86inc/x86inc.asm"
 
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
 SECTION_RODATA
 
 pw_11585x2: times 8 dw 23170
@@ -32,106 +28,7 @@ TRANSFORM_COEFFS 15137,   6270
 TRANSFORM_COEFFS 16069,   3196
 TRANSFORM_COEFFS  9102,  13623
 
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
-
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-%if %1 == 0
-  SUM_SUB            0,  1,  9
-%endif
-
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-%if %1 == 0
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-%else
-  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
-  SWAP               0,  1
-%endif
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
-%endmacro
-
 %macro STORE_OUTPUT 2 ; index, result
-%if CONFIG_HIGHBITDEPTH
   ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
   ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
   ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
@@ -144,16 +41,16 @@ SECTION .text
   punpckhwd          m12, m11
   mova               [outputq + 4*%1 +  0], m%2
   mova               [outputq + 4*%1 + 16], m12
-%else
-  mova               [outputq + 2*%1], m%2
-%endif
 %endmacro
 
+SECTION .text
+
+%if ARCH_X86_64
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride
 
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
+  mova               m8, [GLOBAL(pd_8192)]
+  mova              m12, [GLOBAL(pw_11585x2)]
 
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
@@ -180,25 +77,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
   psllw              m7, 2
 
   ; column transform
-  FDCT8_1D  0
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
-
-  STORE_OUTPUT       0, 0
-  STORE_OUTPUT       8, 1
-  STORE_OUTPUT      16, 2
-  STORE_OUTPUT      24, 3
-  STORE_OUTPUT      32, 4
-  STORE_OUTPUT      40, 5
-  STORE_OUTPUT      48, 6
-  STORE_OUTPUT      56, 7
+  ; stage 1
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  paddw m9, m1, m6
+  psubw m1, m6
+
+  paddw m7, m2, m5
+  psubw m2, m5
+
+  paddw m6, m3, m4
+  psubw m3, m4
+
+  ; stage 2
+  paddw m5, m9, m7
+  psubw m9, m7
+
+  paddw m4, m10, m6
+  psubw m10, m6
+
+  paddw m7, m1, m2
+  psubw m1, m2
+
+  ; stage 3
+  paddw m6, m4, m5
+  psubw m4, m5
+
+  pmulhrsw m1, m12
+  pmulhrsw m7, m12
+
+  ; sin(pi / 8), cos(pi / 8)
+  punpcklwd m2, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+  paddd m5, m8
+  paddd m2, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m2, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m5, m9
+  packssdw m2, m10
+
+  pmulhrsw m6, m12
+  pmulhrsw m4, m12
+
+  paddw m9, m3, m1
+  psubw m3, m1
+
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  ; stage 4
+  ; sin(pi / 16), cos(pi / 16)
+  punpcklwd m1, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m1, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m7, 14
+  psrad m1, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m7, m9
+  packssdw m1, m10
+
+  ; sin(3 * pi / 16), cos(3 * pi / 16)
+  punpcklwd m11, m0, m3
+  punpckhwd m0, m3
+  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+  paddd m9, m8
+  paddd m11, m8
+  paddd m3, m8
+  paddd m0, m8
+  psrad m9, 14
+  psrad m11, 14
+  psrad m3, 14
+  psrad m0, 14
+  packssdw m9, m3
+  packssdw m11, m0
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m6, m7
+  punpcklwd m3, m5, m11
+  punpckhwd m6, m7
+  punpckhwd m5, m11
+  punpcklwd m7, m4, m9
+  punpcklwd m10, m2, m1
+  punpckhwd m4, m9
+  punpckhwd m2, m1
+
+  ; stage 2
+  punpckldq m9, m0, m3
+  punpckldq m1, m6, m5
+  punpckhdq m0, m3
+  punpckhdq m6, m5
+  punpckldq m3, m7, m10
+  punpckldq m5, m4, m2
+  punpckhdq m7, m10
+  punpckhdq m4, m2
+
+  ; stage 3
+  punpcklqdq m10, m9, m3
+  punpckhqdq m9, m3
+  punpcklqdq m2, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m3, m1, m5
+  punpckhqdq m1, m5
+  punpcklqdq m7, m6, m4
+  punpckhqdq m6, m4
+
+  ; row transform
+  ; stage 1
+  paddw m5, m10, m6
+  psubw m10, m6
+
+  paddw m4, m9, m7
+  psubw m9, m7
+
+  paddw m6, m2, m1
+  psubw m2, m1
+
+  paddw m7, m0, m3
+  psubw m0, m3
+
+  ;stage 2
+  paddw m1, m5, m7
+  psubw m5, m7
+
+  paddw m3, m4, m6
+  psubw m4, m6
+
+  paddw m7, m9, m2
+  psubw m9, m2
+
+  ; stage 3
+  punpcklwd m6, m1, m3
+  punpckhwd m1, m3
+  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+  paddd m2, m8
+  paddd m6, m8
+  paddd m3, m8
+  paddd m1, m8
+  psrad m2, 14
+  psrad m6, 14
+  psrad m3, 14
+  psrad m1, 14
+  packssdw m2, m3
+  packssdw m6, m1
+
+  pmulhrsw m7, m12
+  pmulhrsw m9, m12
+
+  punpcklwd m3, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+  paddd m1, m8
+  paddd m3, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m1, 14
+  psrad m3, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m1, m4
+  packssdw m3, m5
+
+  paddw m4, m0, m9
+  psubw m0, m9
+
+  paddw m5, m10, m7
+  psubw m10, m7
+
+  ; stage 4
+  punpcklwd m9, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m9, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m7, 14
+  psrad m9, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m7, m4
+  packssdw m9, m5
+
+  punpcklwd m4, m10, m0
+  punpckhwd m10, m0
+  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+  paddd m5, m8
+  paddd m4, m8
+  paddd m0, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m4, 14
+  psrad m0, 14
+  psrad m10, 14
+  packssdw m5, m0
+  packssdw m4, m10
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m2, m7
+  punpcklwd m10, m1, m4
+  punpckhwd m2, m7
+  punpckhwd m1, m4
+  punpcklwd m7, m6, m5
+  punpcklwd m4, m3, m9
+  punpckhwd m6, m5
+  punpckhwd m3, m9
+
+  ; stage 2
+  punpckldq m5, m0, m10
+  punpckldq m9, m2, m1
+  punpckhdq m0, m10
+  punpckhdq m2, m1
+  punpckldq m10, m7, m4
+  punpckldq m1, m6, m3
+  punpckhdq m7, m4
+  punpckhdq m6, m3
+
+  ; stage 3
+  punpcklqdq m4, m5, m10
+  punpckhqdq m5, m10
+  punpcklqdq m3, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m10, m9, m1
+  punpckhqdq m9, m1
+  punpcklqdq m7, m2, m6
+  punpckhqdq m2, m6
+
+  psraw m1, m4, 15
+  psraw m6, m5, 15
+  psraw m8, m3, 15
+  psraw m11, m0, 15
+
+  psubw m4, m1
+  psubw m5, m6
+  psubw m3, m8
+  psubw m0, m11
+
+  psraw m4, 1
+  psraw m5, 1
+  psraw m3, 1
+  psraw m0, 1
+
+  psraw m1, m10, 15
+  psraw m6, m9, 15
+  psraw m8, m7, 15
+  psraw m11, m2, 15
+
+  psubw m10, m1
+  psubw m9, m6
+  psubw m7, m8
+  psubw m2, m11
+
+  psraw m10, 1
+  psraw m9, 1
+  psraw m7, 1
+  psraw m2, 1
+
+  STORE_OUTPUT  0,  4
+  STORE_OUTPUT  8,  5
+  STORE_OUTPUT 16,  3
+  STORE_OUTPUT 24,  0
+  STORE_OUTPUT 32, 10
+  STORE_OUTPUT 40,  9
+  STORE_OUTPUT 48,  7
+  STORE_OUTPUT 56,  2
 
   RET
 %endif
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
index 60446b086..99f17ebdf 100644
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm
@@ -13,6 +13,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref,
 ;                                            int ref_stride,
 ;                                            unsigned char *src,
diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
index a99c0b40e..2a018c1cf 100644
--- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c
@@ -11,8 +11,9 @@
 
 #include <assert.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref,
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
index 133640eb7..e5e3238d5 100644
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -11,8 +11,11 @@
 #include <immintrin.h>
 #include <string.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
 
 // -----------------------------------------------------------------------------
 // Copy and average
@@ -100,103 +103,258 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int filter_x_stride,
-                                  const int16_t *filter_y, int filter_y_stride,
-                                  int width, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m256i s[8], coeffs_y[4];
+
+  const int bits = FILTER_BITS;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m256i src6;
+      __m256i s01 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          0x20);
+      __m256i s12 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          0x20);
+      __m256i s23 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          0x20);
+      __m256i s34 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          0x20);
+      __m256i s45 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          0x20);
+      src6 = _mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+      __m256i s56 = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+          src6, 0x20);
+
+      s[0] = _mm256_unpacklo_epi16(s01, s12);
+      s[1] = _mm256_unpacklo_epi16(s23, s34);
+      s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+      s[4] = _mm256_unpackhi_epi16(s01, s12);
+      s[5] = _mm256_unpackhi_epi16(s23, s34);
+      s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        const __m256i s67 = _mm256_permute2x128_si256(
+            src6,
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+        const __m256i s78 = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+            src6, 0x20);
+
+        s[3] = _mm256_unpacklo_epi16(s67, s78);
+        s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+        const __m256i res_a = convolve(s, coeffs_y);
+
+        __m256i res_a_round = _mm256_sra_epi32(
+            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m256i res_b = convolve(s + 4, coeffs_y);
+          __m256i res_b_round = _mm256_sra_epi32(
+              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+          res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_16bit));
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_16bit, 1));
+        } else if (w == 4) {
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                           _mm256_castsi256_si128(res_a_round));
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           _mm256_extracti128_si256(res_a_round, 1));
+        } else {
+          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+          res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res_a_round));
+          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                       _mm256_extracti128_si256(res_a_round, 1));
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+      }
+    }
+  }
+}
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
-      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
-      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
-      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
-      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    __m256i p0, p1, u0, u1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1, u0, u1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm256_loadu_si256((const __m256i *)dst);
-      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
-
-      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
-      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
-                          _mm256_avg_epu16(p1, u1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1, u0, u1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm_loadu_si128((const __m128i *)dst);
-      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
-
-      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
-      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1, u0, u1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-      src += src_stride << 1;
-      u0 = _mm_loadl_epi64((const __m128i *)dst);
-      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
-
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
-      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
-      dst += dst_stride << 1;
-      h -= 2;
-    } while (h > 0);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
+                                   uint16_t *dst, int dst_stride, int w, int h,
+                                   InterpFilterParams *filter_params_x,
+                                   InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  (void)subpel_y_q4;
+  (void)filter_params_y;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m256i s[4], coeffs_x[4];
+
+  const __m256i round_const_x =
+      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i clip_pixel =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m256i zero = _mm256_setzero_si256();
+
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    for (i = 0; i < h; i += 2) {
+      const __m256i row0 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+      __m256i row1 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+      // even pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 0);
+      s[1] = _mm256_alignr_epi8(r1, r0, 4);
+      s[2] = _mm256_alignr_epi8(r1, r0, 8);
+      s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+      __m256i res_even = convolve(s, coeffs_x);
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                  round_shift_x);
+
+      // odd pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 2);
+      s[1] = _mm256_alignr_epi8(r1, r0, 6);
+      s[2] = _mm256_alignr_epi8(r1, r0, 10);
+      s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+      __m256i res_odd = convolve(s, coeffs_x);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                 round_shift_x);
+
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
+                                  round_shift_bits);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+                                 round_shift_bits);
+
+      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+      res = _mm256_min_epi16(res, clip_pixel);
+      res = _mm256_max_epi16(res, zero);
+
+      if (w - j > 4) {
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         _mm256_extracti128_si256(res, 1));
+      } else if (w == 4) {
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                         _mm256_extracti128_si256(res, 1));
+      } else {
+        xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+                     _mm256_castsi256_si128(res));
+        xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                     _mm256_extracti128_si256(res, 1));
+      }
+    }
   }
 }
 
+#define CONV8_ROUNDING_BITS (7)
+
 // -----------------------------------------------------------------------------
 // Horizontal and vertical filtering
 
-#define CONV8_ROUNDING_BITS (7)
-
 static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                               7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                               4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
@@ -817,250 +975,6 @@ static void aom_highbd_filter_block1d8_v2_avx2(
   } while (height > 0);
 }
 
-// Calculation with averaging the input pixels
-
-static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
-                                        uint16_t *dst) {
-  const __m128i a0 = _mm256_castsi256_si128(*y0);
-  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
-  __m128i res = _mm_packus_epi32(a0, a1);
-  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
-  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
-  res = _mm_avg_epu16(res, pix);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                        const __m256i *mask, uint16_t *dst,
-                                        ptrdiff_t pitch) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
-  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
-  const __m256i pix =
-      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
-  a = _mm256_min_epi16(a, *mask);
-  a = _mm256_avg_epu16(a, pix);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
-  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
-}
-
-static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                         const __m256i *mask, uint16_t *dst) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
-  a = _mm256_min_epi16(a, *mask);
-  a = _mm256_avg_epu16(a, pix);
-  _mm256_storeu_si256((__m256i *)dst, a);
-}
-
-static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
-                                         const __m256i *mask, uint16_t *dst,
-                                         ptrdiff_t pitch) {
-  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
-  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
-  __m256i p = _mm256_min_epi16(*y0, *mask);
-  p = _mm256_avg_epu16(p, pix0);
-  _mm256_storeu_si256((__m256i *)dst, p);
-
-  p = _mm256_min_epi16(*y1, *mask);
-  p = _mm256_avg_epu16(p, pix1);
-  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
-}
-
-static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
-                                               const __m128i *y1,
-                                               const __m128i *mask,
-                                               uint16_t *dst) {
-  __m128i res = _mm_packus_epi32(*y0, *y1);
-  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
-  res = _mm_min_epi16(res, *mask);
-  res = _mm_avg_epu16(res, pix);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static void aom_highbd_filter_block1d8_h8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_pixels(src_ptr, src_pitch, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    store_8x1_avg_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_v8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[9], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_8x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_8x9_pixels(src_ptr, src_pitch, signal);
-
-    filter_8x9_pixels(signal, ff, &res0, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d16_v8_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[17], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_16x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_16x9_pixels(src_ptr, src_pitch, signal);
-    filter_16x9_pixels(signal, ff, &res0, &res1);
-    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_16x9_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_h2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_2t_pixels(src_ptr, signal);
-    filter_8x1_2t_pixels(signal, &ff, &res0);
-    store_8x1_avg_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_2t_pixels(src_ptr, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d16_v2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[3], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-  __m256i ff;
-
-  pack_2t_filter(filter, &ff);
-  pack_16x2_init(src_ptr, signal);
-
-  do {
-    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-static void aom_highbd_filter_block1d8_v2_avg_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m128i signal[3], res0, res1;
-  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
-  __m128i ff;
-
-  pack_8x1_2t_filter(filter, &ff);
-  pack_8x2_init(src_ptr, signal);
-
-  do {
-    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
-    filter_8_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
 void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                         ptrdiff_t, uint32_t, const int16_t *,
                                         int);
@@ -1080,32 +994,5 @@ void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
 
 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
-
-void aom_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void aom_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-#define aom_highbd_filter_block1d4_h8_avg_avx2 \
-  aom_highbd_filter_block1d4_h8_avg_sse2
-#define aom_highbd_filter_block1d4_h2_avg_avx2 \
-  aom_highbd_filter_block1d4_h2_avg_sse2
-#define aom_highbd_filter_block1d4_v8_avg_avx2 \
-  aom_highbd_filter_block1d4_v8_avg_sse2
-#define aom_highbd_filter_block1d4_v2_avg_avx2 \
-  aom_highbd_filter_block1d4_v2_avg_sse2
-
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
 
 #undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 000000000..f7ac9b496
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m128i s[16], coeffs_y[4];
+
+  const int bits = FILTER_BITS;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sra_epi32(
+            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+        const __m128i res_a1 = convolve(s + 8, coeffs_y);
+        __m128i res_a_round1 = _mm_sra_epi32(
+            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m128i res_b0 = convolve(s + 4, coeffs_y);
+          __m128i res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          __m128i res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_16bit1);
+        } else if (w == 4) {
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_a_round1);
+        } else {
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          *((uint32_t *)(&dst[i * dst_stride + j])) =
+              _mm_cvtsi128_si32(res_a_round0);
+
+          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+              _mm_cvtsi128_si32(res_a_round1);
+        }
+
+        s[0] = s[1];
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+
+        s[0 + 8] = s[1 + 8];
+        s[1 + 8] = s[2 + 8];
+        s[2 + 8] = s[3 + 8];
+
+        s[4 + 8] = s[5 + 8];
+        s[5 + 8] = s[6 + 8];
+        s[6 + 8] = s[7 + 8];
+
+        s6 = s8;
+      }
+    }
+  }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  (void)subpel_y_q4;
+  (void)filter_params_y;
+
+  // Check that, even with 12-bit input, the intermediate values will fit
+  // into an unsigned 16-bit intermediate array.
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m128i s[4], coeffs_x[4];
+
+  const __m128i round_const_x =
+      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    {
+      for (i = 0; i < h; i += 1) {
+        const __m128i row00 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i row01 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+        // even pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 0);
+        s[1] = _mm_alignr_epi8(row01, row00, 4);
+        s[2] = _mm_alignr_epi8(row01, row00, 8);
+        s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+        __m128i res_even = convolve(s, coeffs_x);
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                 round_shift_x);
+
+        // odd pixels
+        s[0] = _mm_alignr_epi8(row01, row00, 2);
+        s[1] = _mm_alignr_epi8(row01, row00, 6);
+        s[2] = _mm_alignr_epi8(row01, row00, 10);
+        s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+        __m128i res_odd = convolve(s, coeffs_x);
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+                                 round_shift_bits);
+        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+                                round_shift_bits);
+
+        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+        res = _mm_min_epi16(res, clip_pixel);
+        res = _mm_max_epi16(res, zero);
+
+        if (w - j > 4) {
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        } else if (w == 4) {
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+        } else {
+          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+        }
+      }
+    }
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
deleted file mode 100644
index e001a1d70..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "aom_ports/msvc.h"
-#include "./aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-// D45E_PRED
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
-                                 const __m256i *z) {
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i a = _mm256_avg_epu16(*x, *z);
-  const __m256i b =
-      _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
-  return _mm256_avg_epu16(b, *y);
-}
-
-static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
-                            const __m256i *a2, uint16_t **dst,
-                            ptrdiff_t stride) {
-  const __m256i y = avg3_epu16(a0, a1, a2);
-  _mm256_storeu_si256((__m256i *)*dst, y);
-  *dst += stride;
-}
-
-void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 9);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
-  x0 = _mm256_insert_epi16(x0, above[23], 15);
-  const __m256i y = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 15);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
-  d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  x2 = _mm256_insert_epi16(x2, above[31], 15);
-  const __m256i y = avg3_epu16(&x0, &x1, &x2);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-
-  d45e_w16(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
-    d45e_w16(&x0, &x1, &x2, &dst, stride);
-  } while (i < 33);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
-  x0 = _mm256_insert_epi16(x0, above[47], 15);
-  const __m256i y = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
-
-  uint16_t *dst1 = dst;
-  uint16_t *dst2 = dst + 16;
-
-  d45e_w16(&x0, &x1, &x2, &dst1, stride);
-  d45e_w16(&y0, &y1, &y2, &dst2, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x1, &x2, &x0, &dst1, stride);
-    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x2, &x0, &x1, &dst1, stride);
-    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x0, &x1, &x2, &dst1, stride);
-    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y0, &y1, &y2, &dst2, stride);
-  } while (i < 15);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
-  d45e_w16(&x1, &x2, &x0, &dst1, stride);
-  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
-  d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-  x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  d45e_w16(&x2, &x0, &x1, &dst1, stride);
-  y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
-  d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-  x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i u = avg3_epu16(&x0, &x1, &x2);
-  _mm256_storeu_si256((__m256i *)dst1, u);
-
-  y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
-  y2 = _mm256_insert_epi16(y2, above[47], 15);
-  u = avg3_epu16(&y0, &y1, &y2);
-  _mm256_storeu_si256((__m256i *)dst2, u);
-}
-
-void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m256i x0 = _mm256_loadu_si256((const __m256i *)above);
-  __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
-  __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
-  __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
-  __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
-  __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
-
-  uint16_t *dst1 = dst;
-  uint16_t *dst2 = dst + 16;
-
-  d45e_w16(&x0, &x1, &x2, &dst1, stride);
-  d45e_w16(&y0, &y1, &y2, &dst2, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x1, &x2, &x0, &dst1, stride);
-    y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y1, &y2, &y0, &dst2, stride);
-
-    x1 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x2, &x0, &x1, &dst1, stride);
-    y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y2, &y0, &y1, &dst2, stride);
-
-    x2 = _mm256_loadu_si256((const __m256i *)(above + i));
-    d45e_w16(&x0, &x1, &x2, &dst1, stride);
-    y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
-    d45e_w16(&y0, &y1, &y2, &dst2, stride);
-  } while (i < 33);
-
-  x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
-  __m256i u = avg3_epu16(&x1, &x2, &x0);
-  _mm256_storeu_si256((__m256i *)dst1, u);
-
-  y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
-  y0 = _mm256_insert_epi16(y0, above[63], 15);
-  u = avg3_epu16(&y1, &y2, &y0);
-  _mm256_storeu_si256((__m256i *)dst2, u);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
index 691e166cf..5a55736c4 100644
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -11,7 +11,7 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
 // -----------------------------------------------------------------------------
 // H_PRED
@@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
     dst += stride;
   }
 }
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
-                                 const __m128i *z) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a = _mm_avg_epu16(*x, *z);
-  const __m128i b =
-      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
-  return _mm_avg_epu16(b, *y);
-}
-
-void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
-  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
-  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
-  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
-  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
-  const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
-  const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
-  const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
-  const __m128i row0 = _mm_srli_si128(avg2, 6);
-  const __m128i row1 = _mm_srli_si128(avg3, 4);
-  const __m128i row2 = _mm_srli_si128(avg2, 4);
-  const __m128i row3 = _mm_srli_si128(avg3, 2);
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-
-  dst -= stride;
-  dst[0] = _mm_extract_epi16(avg3, 1);
-  dst[stride] = _mm_extract_epi16(avg3, 0);
-}
-
-void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
-  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
-  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
-  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
-  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
-  const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
-  const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
-  const __m128i row0 = _mm_srli_si128(avg3, 6);
-  const __m128i row1 = _mm_srli_si128(avg3, 4);
-  const __m128i row2 = _mm_srli_si128(avg3, 2);
-  const __m128i row3 = avg3;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const int I = left[0];
-  const int J = left[1];
-  const int K = left[2];
-  const int L = left[3];
-  const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));
-  const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
-  const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
-  const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
-  const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
-  const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
-  const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
-  const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
-  const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
-  const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
-  const __m128i row2 = _mm_srli_si128(row3, 4);
-  const __m128i row1 = _mm_srli_si128(row3, 8);
-  const __m128i row0 = _mm_srli_si128(avg3, 4);
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst[0] = _mm_extract_epi16(avg2, 3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
-  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
-  __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
-  CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);
-  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
-  (void)left;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
-}
-
-void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i h76543210 = _mm_load_si128((const __m128i *)above);
-  __m128i hx7654321 = _mm_srli_si128(h76543210, 2);
-  __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);
-  __m128i hx8765432 = _mm_srli_si128(h87654321, 2);
-  __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);
-  __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));
-  dst += stride;
-
-  // hcba98765
-  h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));
-  h76543210 = _mm_insert_epi16(h76543210, above[11], 7);
-  // hxcba9876
-  hx7654321 = _mm_srli_si128(h76543210, 2);
-  // hxxcba987
-  hx8765432 = _mm_srli_si128(h76543210, 4);
-  avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);
-  _mm_storel_epi64((__m128i *)dst, avg3);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
-}
-
-void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-  __m128i y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 3));
-  y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x1 = _mm_loadu_si128((const __m128i *)(above + 4));
-  y = avg3_epu16(&x2, &x0, &x1);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x2 = _mm_loadu_si128((const __m128i *)(above + 5));
-  x2 = _mm_insert_epi16(x2, above[11], 7);
-  y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-}
-
-static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
-                           const __m128i *a2, uint16_t **dst,
-                           ptrdiff_t stride) {
-  const __m128i y = avg3_epu16(a0, a1, a2);
-  _mm_storeu_si128((__m128i *)*dst, y);
-  *dst += stride;
-}
-
-void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
-  d45e_w8(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x0, &x1, &x2, &dst, stride);
-  } while (i < 9);
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 9));
-  x0 = _mm_insert_epi16(x0, above[15], 7);
-  const __m128i y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-}
-
-void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  __m128i x0 = _mm_load_si128((const __m128i *)above);
-  __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
-  __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
-
-  d45e_w8(&x0, &x1, &x2, &dst, stride);
-
-  int i = 3;
-  do {
-    x0 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x1, &x2, &x0, &dst, stride);
-
-    x1 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x2, &x0, &x1, &dst, stride);
-
-    x2 = _mm_loadu_si128((const __m128i *)(above + i++));
-    d45e_w8(&x0, &x1, &x2, &dst, stride);
-  } while (i < 15);
-
-  x0 = _mm_loadu_si128((const __m128i *)(above + 15));
-  __m128i y = avg3_epu16(&x1, &x2, &x0);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x1 = _mm_loadu_si128((const __m128i *)(above + 16));
-  y = avg3_epu16(&x2, &x0, &x1);
-  _mm_store_si128((__m128i *)dst, y);
-  dst += stride;
-
-  x2 = _mm_loadu_si128((const __m128i *)(above + 17));
-  x2 = _mm_insert_epi16(x2, above[23], 7);
-  y = avg3_epu16(&x0, &x1, &x2);
-  _mm_store_si128((__m128i *)dst, y);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
deleted file mode 100644
index b089a3f43..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-/*
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-*/
-static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
-                                 const __m128i *z) {
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a = _mm_avg_epu16(*x, *z);
-  const __m128i b =
-      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
-  return _mm_avg_epu16(b, *y);
-}
-
-DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
-  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
-};
-
-static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
-  *a = _mm_shuffle_epi8(*a, *rotrw);
-  return *a;
-}
-
-void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i IXABCDEF =
-      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
-  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
-  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
-  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
-  __m128i rowa = avg2;
-  __m128i rowb = avg3;
-  int i;
-  (void)bd;
-  for (i = 0; i < 8; i += 2) {
-    _mm_store_si128((__m128i *)dst, rowa);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, rowb);
-    dst += stride;
-    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
-    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
-  }
-}
-
-void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A0 = _mm_load_si128((const __m128i *)above);
-  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
-  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
-  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
-  const __m128i L1_ = _mm_srli_si128(L1, 2);
-  __m128i rowa_0 = avg2_0;
-  __m128i rowa_1 = avg2_1;
-  __m128i rowb_0 = avg3_0;
-  __m128i rowb_1 = avg3_1;
-  __m128i avg3_left[2];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
-  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
-  for (i = 0; i < 2; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; j += 2) {
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      dst += stride;
-      _mm_store_si128((__m128i *)dst, rowb_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
-      dst += stride;
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
-      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
-    }
-  }
-}
-
-void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_load_si128((const __m128i *)above);
-  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
-  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
-  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
-  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
-  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
-  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
-  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
-  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
-  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
-  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
-  const __m128i L3_ = _mm_srli_si128(L3, 2);
-  __m128i rowa_0 = avg2_0;
-  __m128i rowa_1 = avg2_1;
-  __m128i rowa_2 = avg2_2;
-  __m128i rowa_3 = avg2_3;
-  __m128i rowb_0 = avg3_0;
-  __m128i rowb_1 = avg3_1;
-  __m128i rowb_2 = avg3_2;
-  __m128i rowb_3 = avg3_3;
-  __m128i avg3_left[4];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
-  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
-  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
-  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
-  for (i = 0; i < 4; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; j += 2) {
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
-      dst += stride;
-      _mm_store_si128((__m128i *)dst, rowb_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
-      dst += stride;
-      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
-      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
-      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
-      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
-      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
-    }
-  }
-}
-
-void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
-  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i AXIJKLMN =
-      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
-  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
-  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
-  __m128i rowa = avg3;
-  int i;
-  (void)bd;
-  for (i = 0; i < 8; ++i) {
-    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
-    _mm_store_si128((__m128i *)dst, rowa);
-    dst += stride;
-  }
-}
-
-void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i B0 = _mm_load_si128((const __m128i *)above);
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
-  const __m128i C1 = _mm_srli_si128(B1, 2);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
-  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
-  __m128i rowa_0 = avg3_0;
-  __m128i rowa_1 = avg3_1;
-  __m128i avg3_left[2];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
-  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
-  for (i = 0; i < 2; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; ++j) {
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i B0 = _mm_load_si128((const __m128i *)above);
-  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
-  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
-  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
-  const __m128i C3 = _mm_srli_si128(B3, 2);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
-  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
-  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
-  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
-  __m128i rowa_0 = avg3_0;
-  __m128i rowa_1 = avg3_1;
-  __m128i rowa_2 = avg3_2;
-  __m128i rowa_3 = avg3_3;
-  __m128i avg3_left[4];
-  int i, j;
-  (void)bd;
-  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
-  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
-  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
-  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
-  for (i = 0; i < 4; ++i) {
-    __m128i avg_left = avg3_left[i];
-    for (j = 0; j < 8; ++j) {
-      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
-      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
-      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
-      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
-      _mm_store_si128((__m128i *)dst, rowa_0);
-      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
-      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
-      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                         const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
-  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
-  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
-  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
-  const __m128i XIJKLMNO =
-      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
-  const __m128i AXIJKLMN =
-      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
-  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
-  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
-  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
-  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
-  const __m128i row0 =
-      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
-  const __m128i row1 =
-      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
-  const __m128i row2 =
-      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
-  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
-  const __m128i row4 =
-      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
-  const __m128i row5 =
-      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
-  const __m128i row6 =
-      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
-  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, row0);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row1);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row2);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row3);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row4);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row5);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row6);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row7);
-}
-
-void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
-  const __m128i B1 = _mm_srli_si128(A1, 2);
-  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
-  const __m128i C1 = _mm_srli_si128(A1, 4);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
-  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
-  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
-  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
-  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
-  __m128i row_0 = avg3_0;
-  __m128i row_1 = avg3_1;
-  __m128i avg2_avg3_left[2][2];
-  int i, j;
-  (void)bd;
-
-  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
-
-  for (j = 0; j < 2; ++j) {
-    for (i = 0; i < 2; ++i) {
-      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      dst += stride;
-    }
-  }
-}
-
-void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
-  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
-  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
-  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
-  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
-  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
-  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
-  const __m128i B3 = _mm_srli_si128(A3, 2);
-  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
-  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
-  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
-  const __m128i C3 = _mm_srli_si128(A3, 4);
-  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
-  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
-  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
-  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
-  const __m128i L0 = _mm_load_si128((const __m128i *)left);
-  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
-  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
-  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
-  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
-  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
-  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
-  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
-  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
-  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
-  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
-  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
-  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
-  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
-  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
-  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
-  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
-  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
-  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
-  __m128i row_0 = avg3_0;
-  __m128i row_1 = avg3_1;
-  __m128i row_2 = avg3_2;
-  __m128i row_3 = avg3_3;
-  __m128i avg2_avg3_left[4][2];
-  int i, j;
-  (void)bd;
-
-  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
-  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
-  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
-  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
-  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
-  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
-
-  for (j = 0; j < 4; ++j) {
-    for (i = 0; i < 2; ++i) {
-      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
-      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
-      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
-      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
-      _mm_store_si128((__m128i *)dst, row_0);
-      _mm_store_si128((__m128i *)(dst + 8), row_1);
-      _mm_store_si128((__m128i *)(dst + 16), row_2);
-      _mm_store_si128((__m128i *)(dst + 24), row_3);
-      dst += stride;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
index 94c68885c..c954da94e 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -11,210 +11,26 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/x86/common_avx2.h"
 #include "aom_dsp/x86/lpf_common_sse2.h"
 #include "aom/aom_integer.h"
 
-#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
-                             const uint8_t *t, int bd, __m256i *blt,
-                             __m256i *lt, __m256i *thr) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
-  __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *blt = _mm256_slli_epi16(y, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
-  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *lt = _mm256_slli_epi16(y, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
-  y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  *thr = _mm256_slli_epi16(y, shift);
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
-                                     __m256i *p, __m256i *q) {
-  int i;
-  for (i = 0; i < size; i++) {
-    p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch));
-    q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch));
-  }
-}
-
-static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q,
-                                   const __m256i *t, __m256i *hev) {
-  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0]));
-  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0]));
-  __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm256_subs_epu16(h, *t);
-
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  const __m256i zero = _mm256_setzero_si256();
-  *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q,
-                                      const __m256i *l, const __m256i *bl,
-                                      __m256i *mask) {
-  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0]));
-  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1]));
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff);
-  max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one));
-
-  int i;
-  for (i = 1; i < 4; ++i) {
-    max = _mm256_max_epi16(max,
-                           _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1])));
-    max = _mm256_max_epi16(max,
-                           _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1])));
-  }
-  max = _mm256_subs_epu16(max, *l);
-  *mask = _mm256_cmpeq_epi16(max, zero);  // return ~mask
-}
-
-static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p,
-                                      const __m256i *q, int bd, int start,
-                                      int end, __m256i *flat) {
-  __m256i max = _mm256_setzero_si256();
-  int i;
-  for (i = start; i < end; ++i) {
-    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0])));
-    max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0])));
-  }
-
-  __m256i ft;
-  if (bd == 8)
-    ft = _mm256_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4));
-
-  const __m256i zero = _mm256_setzero_si256();
-  *flat = _mm256_cmpeq_epi16(ft, zero);
-}
-
-// Note:
-//  Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p,
-                                     const __m256i *q, __m256i *flat, int bd) {
-  // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
-}
-
-// Note:
-//  access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p,
-                                     const __m256i *q, __m256i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
-}
-
-static INLINE void pixel_clamp(const __m256i *min, const __m256i *max,
-                               __m256i *pixel) {
-  __m256i clamped, mask;
-
-  mask = _mm256_cmpgt_epi16(*pixel, *max);
-  clamped = _mm256_andnot_si256(mask, *pixel);
-  mask = _mm256_and_si256(mask, *max);
-  clamped = _mm256_or_si256(mask, clamped);
-
-  mask = _mm256_cmpgt_epi16(clamped, *min);
-  clamped = _mm256_and_si256(mask, clamped);
-  mask = _mm256_andnot_si256(mask, *min);
-  *pixel = _mm256_or_si256(clamped, mask);
-}
-
-static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
-                                  const __m256i *th, int bd, __m256i *ps,
-                                  __m256i *qs) {
-  __m256i t80;
-  if (bd == 8)
-    t80 = _mm256_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm256_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm256_set1_epi16(0x800);
-
-  __m256i ps0 = _mm256_subs_epi16(p[0], t80);
-  __m256i ps1 = _mm256_subs_epi16(p[1], t80);
-  __m256i qs0 = _mm256_subs_epi16(q[0], t80);
-  __m256i qs1 = _mm256_subs_epi16(q[1], t80);
-
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filter = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filter);
-
-  __m256i hev;
-  highbd_hev_mask(p, q, th, &hev);
-  filter = _mm256_and_si256(filter, hev);
-
-  const __m256i x = _mm256_subs_epi16(qs0, ps0);
-  filter = _mm256_adds_epi16(filter, x);
-  filter = _mm256_adds_epi16(filter, x);
-  filter = _mm256_adds_epi16(filter, x);
-  pixel_clamp(&pmin, &pmax, &filter);
-  filter = _mm256_and_si256(filter, *mask);
-
-  const __m256i t3 = _mm256_set1_epi16(3);
-  const __m256i t4 = _mm256_set1_epi16(4);
-
-  __m256i filter1 = _mm256_adds_epi16(filter, t4);
-  __m256i filter2 = _mm256_adds_epi16(filter, t3);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter1 = _mm256_srai_epi16(filter1, 3);
-  filter2 = _mm256_srai_epi16(filter2, 3);
-
-  qs0 = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &qs0);
-  ps0 = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &ps0);
-
-  qs[0] = _mm256_adds_epi16(qs0, t80);
-  ps[0] = _mm256_adds_epi16(ps0, t80);
-
-  filter = _mm256_adds_epi16(filter1, one);
-  filter = _mm256_srai_epi16(filter, 1);
-  filter = _mm256_andnot_si256(hev, filter);
-
-  qs1 = _mm256_subs_epi16(qs1, filter);
-  pixel_clamp(&pmin, &pmax, &qs1);
-  ps1 = _mm256_adds_epi16(ps1, filter);
-  pixel_clamp(&pmin, &pmax, &ps1);
-
-  qs[1] = _mm256_adds_epi16(qs1, t80);
-  ps[1] = _mm256_adds_epi16(ps1, t80);
-}
-#endif  // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
-
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+                                         blimit1, limit1, thresh1, bd);
 }
 
-void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blt, const uint8_t *lt,
-                                          const uint8_t *thr, int bd) {
-  aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(
+    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+                                       limit1, thresh1, bd);
 }
 
 void aom_highbd_lpf_horizontal_4_dual_avx2(
@@ -248,626 +64,3 @@ void aom_highbd_lpf_vertical_8_dual_avx2(
   aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
                                       limit1, thresh1, bd);
 }
-#else
-void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,
-                                            const uint8_t *blt,
-                                            const uint8_t *lt,
-                                            const uint8_t *thr, int bd) {
-  __m256i blimit, limit, thresh;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
-
-  __m256i p[8], q[8];
-  load_highbd_pixel(s, 8, pitch, p, q);
-
-  __m256i mask;
-  highbd_filter_mask(p, q, &limit, &blimit, &mask);
-
-  __m256i flat, flat2;
-  const __m256i one = _mm256_set1_epi16(1);
-  highbd_flat_mask4(&one, p, q, &flat, bd);
-  highbd_flat_mask5(&one, p, q, &flat2, bd);
-
-  flat = _mm256_and_si256(flat, mask);
-  flat2 = _mm256_and_si256(flat2, flat);
-
-  __m256i ps[2], qs[2];
-  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
-
-  // flat and wide flat calculations
-  __m256i flat_p[3], flat_q[3];
-  __m256i flat2_p[7], flat2_q[7];
-  {
-    const __m256i eight = _mm256_set1_epi16(8);
-    const __m256i four = _mm256_set1_epi16(4);
-
-    __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]),
-                                     _mm256_add_epi16(p[4], p[3]));
-    __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]),
-                                     _mm256_add_epi16(q[4], q[3]));
-
-    __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1]));
-    sum_p = _mm256_add_epi16(sum_p, sum_lp);
-
-    __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1]));
-    sum_q = _mm256_add_epi16(sum_q, sum_lq);
-    sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q));
-    sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq));
-
-    flat2_p[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4);
-    flat2_q[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4);
-    flat_p[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3);
-    flat_q[0] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3);
-
-    __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]);
-    __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]);
-    __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]);
-    __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]);
-
-    sum_q = _mm256_sub_epi16(sum_p, p[6]);
-    sum_p = _mm256_sub_epi16(sum_p, q[6]);
-    flat2_p[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4);
-    flat2_q[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4);
-
-    sum_lq = _mm256_sub_epi16(sum_lp, p[2]);
-    sum_lp = _mm256_sub_epi16(sum_lp, q[2]);
-    flat_p[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3);
-    flat_q[1] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3);
-
-    sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
-    sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
-    sum_p3 = _mm256_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm256_add_epi16(sum_q3, q[3]);
-
-    sum_p = _mm256_sub_epi16(sum_p, q[5]);
-    sum_q = _mm256_sub_epi16(sum_q, p[5]);
-    flat2_p[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4);
-    flat2_q[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4);
-
-    sum_lp = _mm256_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm256_sub_epi16(sum_lq, p[1]);
-    flat_p[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3);
-    flat_q[2] = _mm256_srli_epi16(
-        _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3);
-
-    int i;
-    for (i = 3; i < 7; ++i) {
-      sum_p7 = _mm256_add_epi16(sum_p7, p[7]);
-      sum_q7 = _mm256_add_epi16(sum_q7, q[7]);
-      sum_p = _mm256_sub_epi16(sum_p, q[7 - i]);
-      sum_q = _mm256_sub_epi16(sum_q, p[7 - i]);
-      flat2_p[i] = _mm256_srli_epi16(
-          _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4);
-      flat2_q[i] = _mm256_srli_epi16(
-          _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4);
-    }
-  }
-
-  // highbd_filter8
-  p[2] = _mm256_andnot_si256(flat, p[2]);
-  //  p2 remains unchanged if !(flat && mask)
-  flat_p[2] = _mm256_and_si256(flat, flat_p[2]);
-  //  when (flat && mask)
-  p[2] = _mm256_or_si256(p[2], flat_p[2]);  // full list of p2 values
-  q[2] = _mm256_andnot_si256(flat, q[2]);
-  flat_q[2] = _mm256_and_si256(flat, flat_q[2]);
-  q[2] = _mm256_or_si256(q[2], flat_q[2]);  // full list of q2 values
-
-  int i;
-  for (i = 1; i >= 0; i--) {
-    ps[i] = _mm256_andnot_si256(flat, ps[i]);
-    flat_p[i] = _mm256_and_si256(flat, flat_p[i]);
-    p[i] = _mm256_or_si256(ps[i], flat_p[i]);
-    qs[i] = _mm256_andnot_si256(flat, qs[i]);
-    flat_q[i] = _mm256_and_si256(flat, flat_q[i]);
-    q[i] = _mm256_or_si256(qs[i], flat_q[i]);
-  }
-
-  // highbd_filter16
-
-  for (i = 6; i >= 0; i--) {
-    //  p[i] remains unchanged if !(flat2 && flat && mask)
-    p[i] = _mm256_andnot_si256(flat2, p[i]);
-    flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]);
-    //  get values for when (flat2 && flat && mask)
-    p[i] = _mm256_or_si256(p[i], flat2_p[i]);  // full list of p values
-
-    q[i] = _mm256_andnot_si256(flat2, q[i]);
-    flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]);
-    q[i] = _mm256_or_si256(q[i], flat2_q[i]);
-    _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]);
-    _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]);
-  }
-}
-
-static INLINE void highbd_transpose16x16(uint16_t *src, int src_p,
-                                         uint16_t *dst, int dst_p) {
-  __m256i x[16];
-  int i;
-  for (i = 0; i < 16; ++i) {
-    x[i] = _mm256_loadu_si256((const __m256i *)src);
-    src += src_p;
-  }
-  mm256_transpose_16x16(x, x);
-  for (i = 0; i < 16; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, x[i]);
-    dst += dst_p;
-  }
-}
-
-void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
-
-  //  Transpose 16x16
-  highbd_transpose16x16(s - 8, p, t_dst, 16);
-
-  //  Loop filtering
-  aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, bd);
-
-  //  Transpose back
-  highbd_transpose16x16(t_dst, 16, s - 8, p);
-}
-
-static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0,
-                                  const uint8_t *t0, const uint8_t *b1,
-                                  const uint8_t *l1, const uint8_t *t1, int bd,
-                                  __m256i *blt, __m256i *lt, __m256i *thr) {
-  const __m128i z128 = _mm_setzero_si128();
-  const __m128i blimit0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128);
-  const __m128i limit0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128);
-  const __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128);
-  const __m128i blimit1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128);
-  const __m128i limit1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128);
-  const __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128);
-
-  *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1);
-  *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1);
-  *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1);
-
-  int shift = bd - 8;
-  *blt = _mm256_slli_epi16(*blt, shift);
-  *lt = _mm256_slli_epi16(*lt, shift);
-  *thr = _mm256_slli_epi16(*thr, shift);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
-  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
-  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
-  __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p));
-  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
-  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
-
-  const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
-  const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
-
-  __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
-  __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
-
-  __m256i blimit, limit, thresh;
-  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit, &limit, &thresh);
-
-  __m256i t80, tff80, tffe0, t1f, t7f;
-  if (bd == 8) {
-    t80 = _mm256_set1_epi16(0x80);
-    tff80 = _mm256_set1_epi16(0xff80);
-    tffe0 = _mm256_set1_epi16(0xffe0);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8);
-  } else if (bd == 10) {
-    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2);
-    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2);
-    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6);
-  } else {  // bd == 12
-    t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4);
-    tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4);
-    tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4);
-    t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4);
-    t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4);
-  }
-
-  __m256i ps1 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80);
-  __m256i ps0 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80);
-  __m256i qs0 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80);
-  __m256i qs1 =
-      _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80);
-
-  // filter_mask and hev_mask
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  __m256i hev = _mm256_subs_epu16(flat, thresh);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);
-
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-  __m256i mask =
-      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
-  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  const __m256i one = _mm256_set1_epi16(1);
-  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
-  mask = _mm256_max_epi16(flat, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  // mask |= (abs(q1 - q0) > limit) * -1;
-  __m256i work = _mm256_max_epi16(
-      _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)),
-      _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3)));
-  mask = _mm256_max_epi16(work, mask);
-  work = _mm256_max_epi16(
-      _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)),
-      _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3)));
-  mask = _mm256_max_epi16(work, mask);
-  mask = _mm256_subs_epu16(mask, limit);
-  mask = _mm256_cmpeq_epi16(mask, zero);
-
-  // filter4
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filt = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, hev);
-  __m256i work_a = _mm256_subs_epi16(qs0, ps0);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  pixel_clamp(&pmin, &pmax, &filt);
-
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  filt = _mm256_and_si256(filt, mask);
-
-  const __m256i t4 = _mm256_set1_epi16(4);
-  const __m256i t3 = _mm256_set1_epi16(3);
-
-  __m256i filter1 = _mm256_adds_epi16(filt, t4);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  __m256i filter2 = _mm256_adds_epi16(filt, t3);
-  pixel_clamp(&pmin, &pmax, &filter2);
-
-  // Filter1 >> 3
-  work_a = _mm256_cmpgt_epi16(zero, filter1);  // get the values that are <0
-  filter1 = _mm256_srli_epi16(filter1, 3);
-  work_a = _mm256_and_si256(work_a, tffe0);    // sign bits for the values < 0
-  filter1 = _mm256_and_si256(filter1, t1f);    // clamp the range
-  filter1 = _mm256_or_si256(filter1, work_a);  // reinsert the sign bits
-
-  // Filter2 >> 3
-  work_a = _mm256_cmpgt_epi16(zero, filter2);
-  filter2 = _mm256_srli_epi16(filter2, 3);
-  work_a = _mm256_and_si256(work_a, tffe0);
-  filter2 = _mm256_and_si256(filter2, t1f);
-  filter2 = _mm256_or_si256(filter2, work_a);
-
-  // filt >> 1
-  // equivalent to shifting 0x1f left by bitdepth - 8
-  // and setting new bits to 1
-  filt = _mm256_adds_epi16(filter1, one);
-  work_a = _mm256_cmpgt_epi16(zero, filt);
-  filt = _mm256_srli_epi16(filt, 1);
-  work_a = _mm256_and_si256(work_a, tff80);
-  filt = _mm256_and_si256(filt, t7f);
-  filt = _mm256_or_si256(filt, work_a);
-
-  filt = _mm256_andnot_si256(hev, filt);
-
-  filter1 = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  q0 = _mm256_adds_epi16(filter1, t80);
-
-  filter1 = _mm256_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  q1 = _mm256_adds_epi16(filter1, t80);
-
-  filter2 = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  p0 = _mm256_adds_epi16(filter2, t80);
-
-  filter2 = _mm256_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  p1 = _mm256_adds_epi16(filter2, t80);
-
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-
-  __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p));
-  __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p));
-  __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p));
-  __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p));
-  __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p));
-  __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p));
-
-  __m256i blimit, limit, thresh;
-  get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit, &limit, &thresh);
-
-  __m256i t80;
-  if (bd == 8) {
-    t80 = _mm256_set1_epi16(0x80);
-  } else if (bd == 10) {
-    t80 = _mm256_set1_epi16(0x200);
-  } else {  // bd == 12
-    t80 = _mm256_set1_epi16(0x800);
-  }
-
-  __m256i ps1, ps0, qs0, qs1;
-  ps1 = _mm256_subs_epi16(p1, t80);
-  ps0 = _mm256_subs_epi16(p0, t80);
-  qs0 = _mm256_subs_epi16(q0, t80);
-  qs1 = _mm256_subs_epi16(q1, t80);
-
-  // filter_mask and hev_mask
-  __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0));
-  abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0));
-
-  abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0));
-  abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1));
-  __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0);
-  __m256i hev = _mm256_subs_epu16(flat, thresh);
-  const __m256i zero = _mm256_set1_epi16(0);
-  const __m256i ffff = _mm256_set1_epi16(0xFFFF);
-  hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff);
-
-  abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1);
-  __m256i mask =
-      _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit);
-  mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-
-  const __m256i one = _mm256_set1_epi16(1);
-  mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one));
-  mask = _mm256_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm256_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q1)));
-  mask = _mm256_max_epi16(work, mask);
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q2)));
-  mask = _mm256_max_epi16(work, mask);
-  mask = _mm256_subs_epu16(mask, limit);
-  mask = _mm256_cmpeq_epi16(mask, zero);
-
-  // flat_mask4
-  flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q2, q0)));
-  work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)),
-                          _mm256_abs_epi16(_mm256_sub_epi16(q3, q0)));
-  flat = _mm256_max_epi16(work, flat);
-  flat = _mm256_max_epi16(abs_p1p0, flat);
-  flat = _mm256_max_epi16(abs_q1q0, flat);
-
-  if (bd == 8)
-    flat = _mm256_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4));
-
-  flat = _mm256_cmpeq_epi16(flat, zero);
-  flat = _mm256_and_si256(flat, mask);  // flat & mask
-
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
-  __m256i workp_a, workp_b, workp_shft;
-  workp_a =
-      _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1));
-  const __m256i four = _mm256_set1_epi16(4);
-  workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0);
-  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft);
-
-  workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft);
-
-  workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3);
-  workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2);
-  workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3);
-  _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft);
-
-  // lp filter
-  const __m256i pmax = _mm256_subs_epi16(
-      _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80);
-  const __m256i pmin = _mm256_subs_epi16(zero, t80);
-
-  __m256i filt, filter1, filter2, work_a;
-  filt = _mm256_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, hev);
-  work_a = _mm256_subs_epi16(qs0, ps0);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  filt = _mm256_adds_epi16(filt, work_a);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm256_and_si256(filt, mask);
-
-  const __m256i t4 = _mm256_set1_epi16(4);
-  const __m256i t3 = _mm256_set1_epi16(3);
-
-  filter1 = _mm256_adds_epi16(filt, t4);
-  filter2 = _mm256_adds_epi16(filt, t3);
-
-  // Filter1 >> 3
-  pixel_clamp(&pmin, &pmax, &filter1);
-  filter1 = _mm256_srai_epi16(filter1, 3);
-
-  // Filter2 >> 3
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter2 = _mm256_srai_epi16(filter2, 3);
-
-  // filt >> 1
-  filt = _mm256_adds_epi16(filter1, one);
-  filt = _mm256_srai_epi16(filt, 1);
-  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-  filt = _mm256_andnot_si256(hev, filt);
-
-  work_a = _mm256_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  q0 = _mm256_loadu_si256((__m256i *)flat_oq0);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q0 = _mm256_and_si256(flat, q0);
-  q0 = _mm256_or_si256(work_a, q0);
-
-  work_a = _mm256_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  q1 = _mm256_loadu_si256((__m256i *)flat_oq1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q1 = _mm256_and_si256(flat, q1);
-  q1 = _mm256_or_si256(work_a, q1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p));
-  q2 = _mm256_loadu_si256((__m256i *)flat_oq2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  q2 = _mm256_and_si256(flat, q2);
-  q2 = _mm256_or_si256(work_a, q2);
-
-  work_a = _mm256_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  p0 = _mm256_loadu_si256((__m256i *)flat_op0);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p0 = _mm256_and_si256(flat, p0);
-  p0 = _mm256_or_si256(work_a, p0);
-
-  work_a = _mm256_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm256_adds_epi16(work_a, t80);
-  p1 = _mm256_loadu_si256((__m256i *)flat_op1);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p1 = _mm256_and_si256(flat, p1);
-  p1 = _mm256_or_si256(work_a, p1);
-
-  work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p));
-  p2 = _mm256_loadu_si256((__m256i *)flat_op2);
-  work_a = _mm256_andnot_si256(flat, work_a);
-  p2 = _mm256_and_si256(flat, p2);
-  p2 = _mm256_or_si256(work_a, p2);
-
-  _mm256_storeu_si256((__m256i *)(s - 3 * p), p2);
-  _mm256_storeu_si256((__m256i *)(s - 2 * p), p1);
-  _mm256_storeu_si256((__m256i *)(s - 1 * p), p0);
-  _mm256_storeu_si256((__m256i *)(s + 0 * p), q0);
-  _mm256_storeu_si256((__m256i *)(s + 1 * p), q1);
-  _mm256_storeu_si256((__m256i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
-
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
-
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
-#endif  // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 0a399edf2..83e0098ba 100644
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -11,29 +11,23 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom_ports/emmintrin_compat.h"
-#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
 
-static INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                               __m128i *pixel) {
-  __m128i clamped, mask;
+#include "aom_dsp/x86/lpf_common_sse2.h"
 
-  mask = _mm_cmpgt_epi16(*pixel, *max);
-  clamped = _mm_andnot_si128(mask, *pixel);
-  mask = _mm_and_si128(mask, *max);
-  clamped = _mm_or_si128(mask, clamped);
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+                                         __m128i *pixel) {
+  *pixel = _mm_min_epi16(*pixel, *max);
+  *pixel = _mm_max_epi16(*pixel, *min);
+}
 
-  mask = _mm_cmpgt_epi16(clamped, *min);
-  clamped = _mm_and_si128(mask, clamped);
-  mask = _mm_andnot_si128(mask, *min);
-  *pixel = _mm_or_si128(clamped, mask);
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
 }
 
 static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
                              const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr) {
+                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
   const int shift = bd - 8;
   const __m128i zero = _mm_setzero_si128();
 
@@ -45,6 +39,36 @@ static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
 
   x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
   *thr = _mm_slli_epi16(x, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void get_limit_dual(
+    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+    __m128i *t80_out) {
+  const int shift = bd - 8;
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i x0 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+  __m128i x1 =
+      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *blt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *lt_out = _mm_slli_epi16(x0, shift);
+
+  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+  x0 = _mm_unpacklo_epi64(x0, x1);
+  *thr_out = _mm_slli_epi16(x0, shift);
+
+  *t80_out = _mm_set1_epi16(1 << (bd - 1));
 }
 
 static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
@@ -55,115 +79,217 @@ static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
     q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
   }
 }
-// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q,
-                                   const __m128i *t, __m128i *hev) {
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1]));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1]));
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *t);
 
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-  const __m128i zero = _mm_setzero_si128();
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-}
-
-static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
-                                      const __m128i *l, const __m128i *bl,
-                                      __m128i *mask) {
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0]));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1]));
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+                                           const __m128i *l, const __m128i *bl,
+                                           __m128i *mask) {
+  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
 
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_set1_epi16(0xFFFF);
+
   __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
   max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
   max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
 
   int i;
   for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]),
-                                          _mm_subs_epu16(p[i - 1], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]),
-                                          _mm_subs_epu16(q[i - 1], q[i])));
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
   }
   max = _mm_subs_epu16(max, *l);
   *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
 }
 
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
-                                      const __m128i *q, int bd, int start,
-                                      int end, __m128i *flat) {
-  __m128i max = _mm_setzero_si128();
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+                                                 __m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *abs_p1p0, __m128i *l,
+                                                 __m128i *bl, __m128i *t,
+                                                 __m128i *hev, __m128i *mask) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i ffff = _mm_set1_epi16(0xFFFF);
+  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+  __m128i max, max01, h;
+
+  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
+
+  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
+  // So taking maximums continues to work:
+  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+  // mask |= (abs(*p1 - *p0) > limit) * -1;
+  // mask |= (abs(*q1 - *q0) > limit) * -1;
+  h = _mm_subs_epu16(max01, *t);
+
+  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+  // replicate for the further "merged variables" usage
+  *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+  max = _mm_max_epi16(max, max01);
+  int i;
+  for (i = 2; i < x; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  max = _mm_subs_epu16(max, *l);
+  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
+}
+
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+                                      int start, int end, __m128i *flat) {
+  int i;
+  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+                              abs_diff16(pq[start + 1], pq[0]));
+
+  for (i = start + 2; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+  }
+  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+  __m128i ft;
+  ft = _mm_subs_epu16(max, *th);
+
+  const __m128i zero = _mm_setzero_si128();
+  *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+                                           const __m128i *q, int start, int end,
+                                           __m128i *flat) {
   int i;
-  for (i = start; i < end; ++i) {
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]),
-                                          _mm_subs_epu16(p[0], p[i])));
-    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]),
-                                          _mm_subs_epu16(q[0], q[i])));
+  __m128i max =
+      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+  for (i = start + 1; i < end; ++i) {
+    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
   }
 
   __m128i ft;
-  if (bd == 8)
-    ft = _mm_subs_epu16(max, *th);
-  else if (bd == 10)
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
-  else  // bd == 12
-    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));
+  ft = _mm_subs_epu16(max, *th);
 
   const __m128i zero = _mm_setzero_si128();
   *flat = _mm_cmpeq_epi16(ft, zero);
 }
 
-// Note:
-//  Access p[3-1], p[0], and q[3-1], q[0]
-static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+                                          __m128i *flat2, int bd) {
   // check the distance 1,2,3 against 0
-  flat_mask_internal(th, p, q, bd, 1, 4, flat);
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal(&th, pq, 1, 4, flat);
+  flat_mask_internal(&th, pq, 4, 7, flat2);
 }
 
-// Note:
-//  access p[7-4], p[0], and q[7-4], q[0]
-static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
-                                     const __m128i *q, __m128i *flat, int bd) {
-  flat_mask_internal(th, p, q, bd, 4, 8, flat);
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+                                               const __m128i *q, __m128i *flat,
+                                               __m128i *flat2, int bd) {
+  // check the distance 1,2,3 against 0
+  __m128i th = _mm_set1_epi16(1);
+  th = _mm_slli_epi16(th, bd - 8);
+  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
 }
 
-static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
-                                  const __m128i *th, int bd, __m128i *ps,
-                                  __m128i *qs) {
-  __m128i t80;
-  if (bd == 8)
-    t80 = _mm_set1_epi16(0x80);
-  else if (bd == 10)
-    t80 = _mm_set1_epi16(0x200);
-  else  // bd == 12
-    t80 = _mm_set1_epi16(0x800);
-
-  __m128i ps0 = _mm_subs_epi16(p[0], t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], t80);
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                                 __m128i *hev, __m128i *mask,
+                                                 __m128i *qs1qs0,
+                                                 __m128i *ps1ps0, __m128i *t80,
+                                                 int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i pmax =
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+  __m128i ps1ps0_work, qs1qs0_work, work;
+  __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &work);
+  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  filt = _mm_subs_epi16(filt, work);
+  // (aom_filter + 3 * (qs0 - ps0)) & mask
+  pixel_clamp(&pmin, &pmax, &filt);
+  filt = _mm_and_si128(filt, *mask);
+  filt = _mm_unpacklo_epi64(filt, filt);
+
+  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+  pixel_clamp(&pmin, &pmax, &filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
 
+  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+  // filt >> 1
+  filt = _mm_adds_epi16(filt, one);
+  filt = _mm_srai_epi16(filt, 1);
+  filt = _mm_andnot_si128(*hev, filt);
+
+  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+                                            __m128i *qs, const __m128i *mask,
+                                            const __m128i *th, int bd,
+                                            __m128i *t80) {
+  __m128i ps0 = _mm_subs_epi16(p[0], *t80);
+  __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+  __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+  __m128i qs1 = _mm_subs_epi16(q[1], *t80);
   const __m128i one = _mm_set1_epi16(1);
   const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
+      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
 
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i pmin = _mm_subs_epi16(zero, *t80);
   __m128i filter = _mm_subs_epi16(ps1, qs1);
   pixel_clamp(&pmin, &pmax, &filter);
 
+  // hev_filter
   __m128i hev;
-  highbd_hev_mask(p, q, th, &hev);
+  const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+  const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+  h = _mm_subs_epu16(h, *th);
+  const __m128i ffff = _mm_cmpeq_epi16(h, h);
+  hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
   filter = _mm_and_si128(filter, hev);
 
   const __m128i x = _mm_subs_epi16(qs0, ps0);
@@ -172,145 +298,332 @@ static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
   filter = _mm_adds_epi16(filter, x);
   pixel_clamp(&pmin, &pmax, &filter);
   filter = _mm_and_si128(filter, *mask);
-
   const __m128i t3 = _mm_set1_epi16(3);
   const __m128i t4 = _mm_set1_epi16(4);
-
   __m128i filter1 = _mm_adds_epi16(filter, t4);
   __m128i filter2 = _mm_adds_epi16(filter, t3);
   pixel_clamp(&pmin, &pmax, &filter1);
   pixel_clamp(&pmin, &pmax, &filter2);
   filter1 = _mm_srai_epi16(filter1, 3);
   filter2 = _mm_srai_epi16(filter2, 3);
-
   qs0 = _mm_subs_epi16(qs0, filter1);
   pixel_clamp(&pmin, &pmax, &qs0);
   ps0 = _mm_adds_epi16(ps0, filter2);
   pixel_clamp(&pmin, &pmax, &ps0);
-
-  qs[0] = _mm_adds_epi16(qs0, t80);
-  ps[0] = _mm_adds_epi16(ps0, t80);
-
+  qs[0] = _mm_adds_epi16(qs0, *t80);
+  ps[0] = _mm_adds_epi16(ps0, *t80);
   filter = _mm_adds_epi16(filter1, one);
   filter = _mm_srai_epi16(filter, 1);
   filter = _mm_andnot_si128(hev, filter);
-
   qs1 = _mm_subs_epi16(qs1, filter);
   pixel_clamp(&pmin, &pmax, &qs1);
   ps1 = _mm_adds_epi16(ps1, filter);
   pixel_clamp(&pmin, &pmax, &ps1);
-
-  qs[1] = _mm_adds_epi16(qs1, t80);
-  ps[1] = _mm_adds_epi16(ps1, t80);
+  qs[1] = _mm_adds_epi16(qs1, *t80);
+  ps[1] = _mm_adds_epi16(ps1, *t80);
 }
 
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput;
-
-static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
-                                                   const uint8_t *blt,
-                                                   const uint8_t *lt,
-                                                   const uint8_t *thr, int bd,
-                                                   PixelOutput pixel_output) {
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+    __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+    const unsigned char *lt, const unsigned char *thr, int bd) {
+  int i;
   __m128i blimit, limit, thresh;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
+  __m128i t80;
+  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+  for (i = 0; i < 7; i++) {
+    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+  }
+  __m128i mask, hevhev;
+  __m128i p1p0, q1q0, abs_p1p0;
 
-  __m128i p[8], q[8];
-  load_highbd_pixel(s, 8, pitch, p, q);
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hevhev, &mask);
 
-  __m128i mask;
-  highbd_filter_mask(p, q, &limit, &blimit, &mask);
+  __m128i ps0ps1, qs0qs1;
+  // filter4
+  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
 
   __m128i flat, flat2;
-  const __m128i one = _mm_set1_epi16(1);
-  highbd_flat_mask4(&one, p, q, &flat, bd);
-  highbd_flat_mask5(&one, p, q, &flat2, bd);
+  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
 
   flat = _mm_and_si128(flat, mask);
   flat2 = _mm_and_si128(flat2, flat);
 
-  __m128i ps[2], qs[2];
-  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+  flat2 = _mm_unpacklo_epi64(flat2, flat2);
 
   // flat and wide flat calculations
-  __m128i flat_p[3], flat_q[3];
-  __m128i flat2_p[7], flat2_q[7];
+  __m128i flat_p[3], flat_q[3], flat_pq[3];
+  __m128i flat2_p[6], flat2_q[6];
+  __m128i flat2_pq[6];
   {
+    __m128i work0;
     const __m128i eight = _mm_set1_epi16(8);
     const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+    sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+    __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+    sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+    work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+    flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0]));
+    flat2_q[0] =
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0]));
+
+    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0]));
+    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+    __m128i sum_p6, sum_p3;
+    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+    sum_q = _mm_sub_epi16(sum_p, p[5]);
+    sum_p = _mm_sub_epi16(sum_p, q[5]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+    flat2_p[1] = _mm_add_epi16(sum_p, work0);
+    flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+
+    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+    work0 = _mm_add_epi16(sum_p3, pq[1]);
+    flat_p[1] = _mm_add_epi16(sum_lp, work0);
+    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+    flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+    flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+    sum_p = _mm_sub_epi16(sum_p, q[4]);
+    sum_q = _mm_sub_epi16(sum_q, p[4]);
 
-    __m128i sum_p =
-        _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3]));
-    __m128i sum_q =
-        _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3]));
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+    flat2_p[2] = _mm_add_epi16(sum_p, work0);
+    flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
 
+    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+
+    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+    work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+    flat_p[2] = _mm_add_epi16(sum_lp, work0);
+    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[3]);
+    sum_q = _mm_sub_epi16(sum_q, p[3]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+    flat2_p[3] = _mm_add_epi16(sum_p, work0);
+    flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[2]);
+    sum_q = _mm_sub_epi16(sum_q, p[2]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+    flat2_p[4] = _mm_add_epi16(sum_p, work0);
+    flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+    sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[1]);
+    sum_q = _mm_sub_epi16(sum_q, p[1]);
+
+    work0 = _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+    flat2_p[5] = _mm_add_epi16(sum_p, work0);
+    flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+    flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+  }
+
+  // highbd_filter8
+  pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+  pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+  for (i = 0; i < 3; i++) {
+    pq[i] = _mm_andnot_si128(flat, pq[i]);
+    flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+    pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+  }
+
+  // highbd_filter16
+  for (i = 5; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    pq[i] = _mm_andnot_si128(flat2, pq[i]);
+    flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+    //  get values for when (flat2 && flat && mask)
+    pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
+  }
+}
+
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blt, const uint8_t *lt,
+                                       const uint8_t *thr, int bd) {
+  __m128i p[7], q[7], pq[7];
+  int i;
+
+  for (i = 0; i < 7; i++) {
+    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+  }
+
+  highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
+
+  for (i = 0; i < 6; i++) {
+    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
+    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
+  }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+    const uint8_t *thr1, int bd) {
+  __m128i blimit, limit, thresh, t80;
+  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+                 &t80);
+  __m128i mask;
+  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+  __m128i flat, flat2;
+  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+  flat = _mm_and_si128(flat, mask);
+  flat2 = _mm_and_si128(flat2, flat);
+  __m128i ps[2], qs[2];
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+  // flat and wide flat calculations
+  __m128i flat_p[3], flat_q[3];
+  __m128i flat2_p[6], flat2_q[6];
+  {
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
     __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
     sum_p = _mm_add_epi16(sum_p, sum_lp);
-
     __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
     sum_q = _mm_add_epi16(sum_q, sum_lq);
     sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
     sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat2_p[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4);
-    flat2_q[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4);
+    flat2_p[0] = _mm_srli_epi16(
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+                                           _mm_add_epi16(p[1], q[0]))),
+        4);
+    flat2_q[0] = _mm_srli_epi16(
+        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+                                           _mm_add_epi16(p[0], q[1]))),
+        4);
     flat_p[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
     flat_q[0] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
-
-    __m128i sum_p7 = _mm_add_epi16(p[7], p[7]);
-    __m128i sum_q7 = _mm_add_epi16(q[7], q[7]);
+    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
     __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
     __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
-    sum_q = _mm_sub_epi16(sum_p, p[6]);
-    sum_p = _mm_sub_epi16(sum_p, q[6]);
-    flat2_p[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4);
-    flat2_q[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4);
-
+    sum_q = _mm_sub_epi16(sum_p, p[5]);
+    sum_p = _mm_sub_epi16(sum_p, q[5]);
+    flat2_p[1] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+        4);
+    flat2_q[1] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+        4);
     sum_lq = _mm_sub_epi16(sum_lp, p[2]);
     sum_lp = _mm_sub_epi16(sum_lp, q[2]);
     flat_p[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
     flat_q[1] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
-    sum_p7 = _mm_add_epi16(sum_p7, p[7]);
-    sum_q7 = _mm_add_epi16(sum_q7, q[7]);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
     sum_p3 = _mm_add_epi16(sum_p3, p[3]);
     sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-
-    sum_p = _mm_sub_epi16(sum_p, q[5]);
-    sum_q = _mm_sub_epi16(sum_q, p[5]);
-    flat2_p[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4);
-    flat2_q[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4);
-
+    sum_p = _mm_sub_epi16(sum_p, q[4]);
+    sum_q = _mm_sub_epi16(sum_q, p[4]);
+    flat2_p[2] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+        4);
+    flat2_q[2] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+        4);
     sum_lp = _mm_sub_epi16(sum_lp, q[1]);
     sum_lq = _mm_sub_epi16(sum_lq, p[1]);
     flat_p[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
     flat_q[2] =
         _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
-    int i;
-    for (i = 3; i < 7; ++i) {
-      sum_p7 = _mm_add_epi16(sum_p7, p[7]);
-      sum_q7 = _mm_add_epi16(sum_q7, q[7]);
-      sum_p = _mm_sub_epi16(sum_p, q[7 - i]);
-      sum_q = _mm_sub_epi16(sum_q, p[7 - i]);
-      flat2_p[i] =
-          _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4);
-      flat2_q[i] =
-          _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4);
-    }
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[3]);
+    sum_q = _mm_sub_epi16(sum_q, p[3]);
+    flat2_p[3] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+        4);
+    flat2_q[3] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+        4);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[2]);
+    sum_q = _mm_sub_epi16(sum_q, p[2]);
+    flat2_p[4] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+        4);
+    flat2_q[4] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+        4);
+    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+    sum_p = _mm_sub_epi16(sum_p, q[1]);
+    sum_q = _mm_sub_epi16(sum_q, p[1]);
+    flat2_p[5] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_p, _mm_add_epi16(
+                       sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+        4);
+    flat2_q[5] = _mm_srli_epi16(
+        _mm_add_epi16(
+            sum_q, _mm_add_epi16(
+                       sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+        4);
   }
-
   // highbd_filter8
   p[2] = _mm_andnot_si128(flat, p[2]);
   //  p2 remains unchanged if !(flat && mask)
@@ -320,7 +633,6 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
   q[2] = _mm_andnot_si128(flat, q[2]);
   flat_q[2] = _mm_and_si128(flat, flat_q[2]);
   q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-
   int i;
   for (i = 1; i >= 0; i--) {
     ps[i] = _mm_andnot_si128(flat, ps[i]);
@@ -330,675 +642,979 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
     flat_q[i] = _mm_and_si128(flat, flat_q[i]);
     q[i] = _mm_or_si128(qs[i], flat_q[i]);
   }
-
   // highbd_filter16
-
-  if (pixel_output == FOUR_PIXELS) {
-    for (i = 6; i >= 0; i--) {
-      //  p[i] remains unchanged if !(flat2 && flat && mask)
-      p[i] = _mm_andnot_si128(flat2, p[i]);
-      flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-      //  get values for when (flat2 && flat && mask)
-      p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-
-      q[i] = _mm_andnot_si128(flat2, q[i]);
-      flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-      q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]);
-      _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]);
-    }
-  } else {  // EIGHT_PIXELS
-    for (i = 6; i >= 0; i--) {
-      //  p[i] remains unchanged if !(flat2 && flat && mask)
-      p[i] = _mm_andnot_si128(flat2, p[i]);
-      flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-      //  get values for when (flat2 && flat && mask)
-      p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-
-      q[i] = _mm_andnot_si128(flat2, q[i]);
-      flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-      q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-      _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
-    }
+  for (i = 5; i >= 0; i--) {
+    //  p[i] remains unchanged if !(flat2 && flat && mask)
+    p[i] = _mm_andnot_si128(flat2, p[i]);
+    flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+    //  get values for when (flat2 && flat && mask)
+    p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
+    q[i] = _mm_andnot_si128(flat2, q[i]);
+    flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+    q[i] = _mm_or_si128(q[i], flat2_q[i]);
   }
 }
 
-// Note:
-//  highbd_lpf_horz_edge_8_8p() output 8 pixels per register
-//  highbd_lpf_horz_edge_8_4p() output 4 pixels per register
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch,
-                                             const uint8_t *blt,
-                                             const uint8_t *lt,
-                                             const uint8_t *thr, int bd) {
-  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS);
-}
-#endif  // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-
-static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch,
-                                             const uint8_t *blt,
-                                             const uint8_t *lt,
-                                             const uint8_t *thr, int bd) {
-  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS);
-}
-
-void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
-                                           const uint8_t *_blimit,
-                                           const uint8_t *_limit,
-                                           const uint8_t *_thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
-#else
-  highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
-#endif
-}
-
-void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
-#else
-  highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd);
-  highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd);
-#endif
-}
-
-static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
-                                      const __m128i *p0, const __m128i *q0,
-                                      const __m128i *q1, const __m128i *q2,
-                                      int p, uint16_t *s) {
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  _mm_storel_epi64((__m128i *)(s - 3 * p), *p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), *p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), *p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), *q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), *q1);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), *q2);
-#else
-  _mm_store_si128((__m128i *)(s - 3 * p), *p2);
-  _mm_store_si128((__m128i *)(s - 2 * p), *p1);
-  _mm_store_si128((__m128i *)(s - 1 * p), *p0);
-  _mm_store_si128((__m128i *)(s + 0 * p), *q0);
-  _mm_store_si128((__m128i *)(s + 1 * p), *q1);
-  _mm_store_si128((__m128i *)(s + 2 * p), *q2);
-#endif
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p[7], q[7];
+  int i;
+  load_highbd_pixel(s, 7, pitch, p, q);
+
+  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+                                   _limit1, _thresh1, bd);
+
+  for (i = 0; i < 6; i++) {
+    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+  }
 }
 
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
-  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
   __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i workp_a, workp_b, workp_shft;
+  __m128i pq[3];
+  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+  __m128i flat_p1p0, flat_q0q1;
 
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
   __m128i t80;
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_set1_epi16(0x200);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_set1_epi16(0x800);
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
   }
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
 
-  ps1 = _mm_subs_epi16(p1, t80);
-  ps0 = _mm_subs_epi16(p0, t80);
-  qs0 = _mm_subs_epi16(q0, t80);
-  qs1 = _mm_subs_epi16(q1, t80);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  // filter_mask and hev_mask
-  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+}
 
-  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+    const unsigned char *_thresh0, const unsigned char *_blimit1,
+    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat, work;
+  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+  __m128i op1, op0, oq0, oq1;
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p2p1 = abs_diff16(*p2, *p1);
+  abs_p1p0 = abs_diff16(*p1, *p0);
+  abs_q1q0 = abs_diff16(*q1, *q0);
+  abs_q2q1 = abs_diff16(*q2, *q1);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
-  mask = _mm_max_epi16(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  mask = _mm_max_epi16(abs_q1q0, mask);
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  mask = _mm_max_epi16(abs_q2q1, mask);
+  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_max_epi16(mask, abs_p2p1);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
+  // flat_mask
+  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+  flat = _mm_max_epi16(flat, work);
+
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+    op1 = _mm_srli_epi16(workp_shft0, 3);
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
+    workp_a =
+        _mm_add_epi16(workp_a,
+                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+    op0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
+    workp_b = _mm_add_epi16(*q1, *q2);
+    workp_shft0 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
+    oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
+    workp_b = _mm_add_epi16(*q2, *q2);
+    workp_shft1 = _mm_add_epi16(
+        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+    oq1 = _mm_srli_epi16(workp_shft1, 3);
+  }
+  // lp filter
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
+
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
+
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
+
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
+
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
+}
+
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+                             _blimit, _limit, _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    const unsigned char *_blimit, const unsigned char *_limit,
+    const unsigned char *_thresh, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit, limit, thresh;
+  __m128i mask, hev, flat;
+  __m128i pq[4];
+  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1;
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+  __m128i abs_p1p0;
+
+  const __m128i four = _mm_set1_epi16(4);
+  __m128i t80;
+  const __m128i one = _mm_set1_epi16(0x1);
+
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
   // flat_mask4
-  flat = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
-      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
-      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
-  flat = _mm_max_epi16(work, flat);
+  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
   flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(abs_q1q0, flat);
+  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
 
-  if (bd == 8)
-    flat = _mm_subs_epu16(flat, one);
-  else if (bd == 10)
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
-  else  // bd == 12
-    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
 
   flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
+  flat = _mm_and_si128(flat, mask);
+  // replicate for the further "merged variables" usage
+  flat = _mm_unpacklo_epi64(flat, flat);
 
-  // Added before shift for rounding part of ROUND_POWER_OF_TWO
+  {
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // o*p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // o*p0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
+
+  // lp filter
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd);
 
-  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
 
-  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+    const unsigned char *_limit0, const unsigned char *_thresh0,
+    const unsigned char *_blimit1, const unsigned char *_limit1,
+    const unsigned char *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i t80;
+  __m128i mask, flat;
+  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i four = _mm_set1_epi16(4);
+  const __m128i one = _mm_set1_epi16(0x1);
+  const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
+
+  abs_p0q0 = abs_diff16(*p0, *q0);
+  abs_p1q1 = abs_diff16(*p1, *q1);
+
+  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2  > blimit) * -1;
+
+  // So taking maximums continues to work:
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+  work1 =
+      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
+  work0 = _mm_max_epi16(work0, work1);
+  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+  work2 = _mm_max_epi16(work2, work0);
+  mask = _mm_max_epi16(work2, mask);
+
+  mask = _mm_subs_epu16(mask, limit0);
+  mask = _mm_cmpeq_epi16(mask, zero);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
+  flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+  flat = _mm_max_epi16(work1, flat);
+  work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+  flat = _mm_max_epi16(work0, flat);
 
-  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
+  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+  flat = _mm_cmpeq_epi16(flat, zero);
+  flat = _mm_and_si128(flat, mask);  // flat & mask
+
+  {
+    __m128i workp_a, workp_b;
+    // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+    // o*p2
+    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p1
+    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+    op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // o*p0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+    op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+    oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+    oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+  }
 
   // lp filter
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
+  __m128i ps[2], qs[2], p[2], q[2];
+  {
+    p[0] = *p0;
+    p[1] = *p1;
+    q[0] = *q0;
+    q[1] = *q1;
+    // filter_mask and hev_mask
+    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+  }
 
-  filt = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
+  qs[0] = _mm_andnot_si128(flat, qs[0]);
+  oq0 = _mm_and_si128(flat, oq0);
+  *q0 = _mm_or_si128(qs[0], oq0);
 
-  filt = _mm_and_si128(filt, hev);
-  work_a = _mm_subs_epi16(qs0, ps0);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, mask);
+  qs[1] = _mm_andnot_si128(flat, qs[1]);
+  oq1 = _mm_and_si128(flat, oq1);
+  *q1 = _mm_or_si128(qs[1], oq1);
 
-  filter1 = _mm_adds_epi16(filt, t4);
-  filter2 = _mm_adds_epi16(filt, t3);
+  ps[0] = _mm_andnot_si128(flat, ps[0]);
+  op0 = _mm_and_si128(flat, op0);
+  *p0 = _mm_or_si128(ps[0], op0);
 
-  // Filter1 >> 3
-  pixel_clamp(&pmin, &pmax, &filter1);
-  filter1 = _mm_srai_epi16(filter1, 3);
+  ps[1] = _mm_andnot_si128(flat, ps[1]);
+  op1 = _mm_and_si128(flat, op1);
+  *p1 = _mm_or_si128(ps[1], op1);
 
-  // Filter2 >> 3
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter2 = _mm_srai_epi16(filter2, 3);
+  work_a = _mm_andnot_si128(flat, *q2);
+  *q2 = _mm_and_si128(flat, oq2);
+  *q2 = _mm_or_si128(work_a, *q2);
 
-  // filt >> 1
-  filt = _mm_adds_epi16(filter1, t1);
-  filt = _mm_srai_epi16(filt, 1);
-  filt = _mm_andnot_si128(hev, filt);
-
-  work_a = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  q0 = _mm_load_si128((__m128i *)flat_oq0);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q0 = _mm_and_si128(flat, q0);
-  q0 = _mm_or_si128(work_a, q0);
-
-  work_a = _mm_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  q1 = _mm_load_si128((__m128i *)flat_oq1);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q1 = _mm_and_si128(flat, q1);
-  q1 = _mm_or_si128(work_a, q1);
-
-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q2 = _mm_load_si128((__m128i *)flat_oq2);
-  work_a = _mm_andnot_si128(flat, work_a);
-  q2 = _mm_and_si128(flat, q2);
-  q2 = _mm_or_si128(work_a, q2);
-
-  work_a = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  p0 = _mm_load_si128((__m128i *)flat_op0);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p0 = _mm_and_si128(flat, p0);
-  p0 = _mm_or_si128(work_a, p0);
-
-  work_a = _mm_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &work_a);
-  work_a = _mm_adds_epi16(work_a, t80);
-  p1 = _mm_load_si128((__m128i *)flat_op1);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p1 = _mm_and_si128(flat, p1);
-  p1 = _mm_or_si128(work_a, p1);
-
-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p2 = _mm_load_si128((__m128i *)flat_op2);
-  work_a = _mm_andnot_si128(flat, work_a);
-  p2 = _mm_and_si128(flat, p2);
-  p2 = _mm_or_si128(work_a, p2);
-
-  store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s);
+  work_a = _mm_andnot_si128(flat, *p2);
+  *p2 = _mm_and_si128(flat, op2);
+  *p2 = _mm_or_si128(work_a, *p2);
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0;
+
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+
+  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
+                             &p1p0, _blimit, _limit, _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
 }
 
 void aom_highbd_lpf_horizontal_8_dual_sse2(
     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     const uint8_t *_thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+
+  highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
+                                  _blimit0, _limit0, _thresh0, _blimit1,
+                                  _limit1, _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
 }
 
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+    const uint8_t *_thresh, int bd) {
   __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-#endif
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-#endif
-  const __m128i abs_p1p0 =
-      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
-  const __m128i abs_q1q0 =
-      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+  __m128i mask, hev;
+  __m128i p1p0, q1q0;
+  __m128i pq[2];
+
+  __m128i abs_p1p0;
+
+  __m128i t80;
+  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+                                &thresh, &hev, &mask);
+
+  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i blimit0, limit0, thresh0;
+  __m128i mask, flat;
+  __m128i p[2], q[2];
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i abs_p0q0 = abs_diff16(*q0, *p0);
+  __m128i abs_p1q1 = abs_diff16(*q1, *p1);
+
+  __m128i abs_p1p0 = abs_diff16(*p1, *p0);
+  __m128i abs_q1q0 = abs_diff16(*q1, *q0);
+
   const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
   const __m128i one = _mm_set1_epi16(1);
-  __m128i abs_p0q0 =
-      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
-  __m128i abs_p1q1 =
-      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
 
-  const __m128i t4 = _mm_set1_epi16(4);
-  const __m128i t3 = _mm_set1_epi16(3);
   __m128i t80;
-  __m128i tff80;
-  __m128i tffe0;
-  __m128i t1f;
-  // equivalent to shifting 0x1f left by bitdepth - 8
-  // and setting new bits to 1
-  const __m128i t1 = _mm_set1_epi16(0x1);
-  __m128i t7f;
-  // equivalent to shifting 0x7f left by bitdepth - 8
-  // and setting new bits to 1
-  __m128i ps1, ps0, qs0, qs1;
-  __m128i filt;
-  __m128i work_a;
-  __m128i filter1, filter2;
-
-  if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
-    t80 = _mm_set1_epi16(0x80);
-    tff80 = _mm_set1_epi16(0xff80);
-    tffe0 = _mm_set1_epi16(0xffe0);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
-  } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
-    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
-  } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
-    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
-    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
-    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
-  }
 
-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+                 &blimit0, &limit0, &thresh0, &t80);
 
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
   mask = _mm_max_epi16(flat, mask);
 
-#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4)
-  __m128i work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
-      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
-  mask = _mm_max_epi16(work, mask);
-  work = _mm_max_epi16(
-      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
-      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
-  mask = _mm_max_epi16(work, mask);
-#endif
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit0);
   mask = _mm_cmpeq_epi16(mask, zero);
 
-  // filter4
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
-  const __m128i pmin = _mm_subs_epi16(zero, t80);
-
-  filt = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, hev);
-  work_a = _mm_subs_epi16(qs0, ps0);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  filt = _mm_adds_epi16(filt, work_a);
-  pixel_clamp(&pmin, &pmax, &filt);
-
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  filt = _mm_and_si128(filt, mask);
-
-  filter1 = _mm_adds_epi16(filt, t4);
-  pixel_clamp(&pmin, &pmax, &filter1);
-
-  filter2 = _mm_adds_epi16(filt, t3);
-  pixel_clamp(&pmin, &pmax, &filter2);
-
-  // Filter1 >> 3
-  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
-  filter1 = _mm_srli_epi16(filter1, 3);
-  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
-  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
-  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits
+  p[0] = *p0;
+  p[1] = *p1;
+  q[0] = *q0;
+  q[1] = *q1;
 
-  // Filter2 >> 3
-  work_a = _mm_cmpgt_epi16(zero, filter2);
-  filter2 = _mm_srli_epi16(filter2, 3);
-  work_a = _mm_and_si128(work_a, tffe0);
-  filter2 = _mm_and_si128(filter2, t1f);
-  filter2 = _mm_or_si128(filter2, work_a);
+  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+}
 
-  // filt >> 1
-  filt = _mm_adds_epi16(filter1, t1);
-  work_a = _mm_cmpgt_epi16(zero, filt);
-  filt = _mm_srli_epi16(filt, 1);
-  work_a = _mm_and_si128(work_a, tff80);
-  filt = _mm_and_si128(filt, t7f);
-  filt = _mm_or_si128(filt, work_a);
-
-  filt = _mm_andnot_si128(hev, filt);
-
-  q0 = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &q0);
-  q0 = _mm_adds_epi16(q0, t80);
-
-  q1 = _mm_subs_epi16(qs1, filt);
-  pixel_clamp(&pmin, &pmax, &q1);
-  q1 = _mm_adds_epi16(q1, t80);
-
-  p0 = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &p0);
-  p0 = _mm_adds_epi16(p0, t80);
-
-  p1 = _mm_adds_epi16(ps1, filt);
-  pixel_clamp(&pmin, &pmax, &p1);
-  p1 = _mm_adds_epi16(p1, t80);
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-#else
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-#endif
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+                                      const uint8_t *_blimit,
+                                      const uint8_t *_limit,
+                                      const uint8_t *_thresh, int bd) {
+  __m128i p1p0, q1q0;
+  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
+                             _thresh, bd);
+
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
 }
 
 void aom_highbd_lpf_horizontal_4_dual_sse2(
     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     const uint8_t *_thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  __m128i ps[2], qs[2];
+
+  highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
+                                  _thresh0, _blimit1, _limit1, _thresh1, bd);
+
+  _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
+  _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
+  _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
+  _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
 }
 
 void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
-  uint16_t *src[1];
-  uint16_t *dst[1];
+  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
+  __m128i p1p0, q1q0;
+  __m128i p1, q1;
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
+                             thresh, bd);
+
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  // transpose from 8x4 to 4x8
+  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
 
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
 }
 
 void aom_highbd_lpf_vertical_4_dual_sse2(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i ps[2], qs[2];
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+
+  highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+                               &d2, &d3);
+
+  highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+                                  thresh0, blimit1, limit1, thresh1, bd);
+
+  highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+                               &d3, &d4, &d5, &d6, &d7);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+                                    const uint8_t *limit, const uint8_t *thresh,
+                                    int bd) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x3, x2, x1, x0, p0, q0;
+  __m128i p1p0, q1q0;
+
+  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+
+  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
+
+  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
+                             limit, thresh, bd);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1, int bd) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0, p1, q1, p2, q2;
+
+  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+
+  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+                           &p0, &q0, &q1, &q2, &d6, &d7);
+
+  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+                                  _limit0, _thresh0, _blimit1, _limit1,
+                                  _thresh1, bd);
+
+  highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
+
+  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
 }
 
 void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
-  uint16_t *src[1];
-  uint16_t *dst[1];
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p2, p1, p0, p3, q0;
+  __m128i q1q0, p1p0;
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+                               &d6, &d7);
 
   // Loop filtering
-  aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
+  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+                             &p1p0, blimit, limit, thresh, bd);
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+                               &d1, &d2, &d3);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
 }
 
 void aom_highbd_lpf_vertical_8_dual_sse2(
     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+  x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+
+  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+                           &d2, &d3, &d4, &d5, &d6, &d7);
+
+  highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+                                  blimit0, limit0, thresh0, blimit1, limit1,
+                                  thresh1, bd);
+
+  highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+                           &x2, &x3, &x4, &x5, &x6, &x7);
+
+  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
 
-  // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int bd) {
+  __m128i q[7], p[7], pq[7];
+  __m128i p6, p5, p4, p3;
+  __m128i p6_2, p5_2, p4_2, p3_2;
+  __m128i d0, d1, d2, d3;
+  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
-                                        thresh0, blimit1, limit1, thresh1, bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+                               &p[3], &p[2], &p[1], &p[0]);
 
-  // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
-}
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 
-void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
-  uint16_t *src[2];
-  uint16_t *dst[2];
+  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+                               &q[3], &q[4], &q[5], &q[6], &d7_2);
 
-  src[0] = s - 8;
-  src[1] = s;
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 8;
+  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
 
-  // Transpose 16x8
-  highbd_transpose(src, p, dst, 8, 2);
+  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
 
-  // Loop filtering
-  aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
-                                        bd);
-  src[0] = t_dst;
-  src[1] = t_dst + 8 * 8;
-  dst[0] = s - 8;
-  dst[1] = s;
-
-  // Transpose back
-  highbd_transpose(src, 8, dst, p, 2);
-}
-
-void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int bd) {
-  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
-
-  //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
-
-#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
-  highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd);
-#else
-  aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                         thresh, bd);
-#endif
-  //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  q[0] = _mm_srli_si128(pq[0], 8);
+  q[1] = _mm_srli_si128(pq[1], 8);
+  q[2] = _mm_srli_si128(pq[2], 8);
+  q[3] = _mm_srli_si128(pq[3], 8);
+  q[4] = _mm_srli_si128(pq[4], 8);
+  q[5] = _mm_srli_si128(pq[5], 8);
+
+  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  __m128i q[7], p[7];
+  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+  __m128i d0, d7;
+  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+
+  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+                           &q[6], &d7);
+
+  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+                                   limit1, thresh1, bd);
+
+  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+                           &d6_out, &d7_out);
+
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
index 2bbf15ef2..dea113a29 100644
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
index 855bc6558..e0d22522d 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -288,11 +288,9 @@ HIGH_SADNXN4D  8,  8
 HIGH_SADNXN4D  8,  4
 HIGH_SADNXN4D  4,  8
 HIGH_SADNXN4D  4,  4
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SADNXN4D  4, 16
 HIGH_SADNXN4D 16,  4
 HIGH_SADNXN4D  8, 32
 HIGH_SADNXN4D 32,  8
 HIGH_SADNXN4D 16, 64
 HIGH_SADNXN4D 64, 16
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
index 760e68aab..3398d8a2a 100644
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -158,10 +158,8 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -230,10 +228,8 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
 HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -302,12 +298,10 @@ HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
 HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
 HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
-%endif
 
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -376,7 +370,5 @@ HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index ee19796e3..61f5b8e86 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -94,7 +94,7 @@ SECTION .text
 %define filter_idx_shift 5
 
 
-%ifdef PIC    ; 64bit PIC
+%if ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                       x_offset, y_offset, \
@@ -102,19 +102,20 @@ SECTION .text
                                       sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
-    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  dst, dst_stride, height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse, \
+                                        g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp
 
@@ -133,8 +134,9 @@ SECTION .text
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                x_offset, y_offset, dst, dst_stride, height, \
-                                sse, g_bilin_filter, g_pw_8
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse, \
+                                    g_bilin_filter, g_pw_8
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
@@ -153,22 +155,16 @@ SECTION .text
     %endif
   %else
     %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                        x_offset, y_offset, \
+                                        dst, dst_stride, \
+                                        sec, sec_stride, height, sse
       %define block_height dword heightm
       %define sec_str sec_stridemp
-      %endif
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                              x_offset, y_offset, dst, dst_stride, height, sse
+                                    x_offset, y_offset, \
+                                    dst, dst_stride, height, sse
       %define block_height heightd
     %endif
 
@@ -287,14 +283,14 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -311,7 +307,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -514,14 +510,14 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -538,7 +534,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -636,14 +632,14 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -660,7 +656,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -735,14 +731,14 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -759,7 +755,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -862,8 +858,8 @@ SECTION .text
 
 .x_nonhalf_y_nonhalf:
 ; loading filter - this is same as in 8-bit depth
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
   shl           y_offsetd, filter_idx_shift
@@ -872,7 +868,7 @@ SECTION .text
   mova                 m9, [bilin_filter+x_offsetq+16]
   mova                m10, [bilin_filter+y_offsetq]
   mova                m11, [bilin_filter+y_offsetq+16]
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
@@ -900,7 +896,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 ; end of load filter
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
index befd81269..18eb03d12 100644
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -13,8 +13,8 @@
 #include <emmintrin.h>
 #include <stddef.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
                                     const uint16_t *src, ptrdiff_t src_stride,
@@ -204,21 +204,15 @@ SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
 SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
 SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
 SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
-#if CONFIG_EXT_PARTITION
 SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
 SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
 SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
-#endif
 SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
 SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
 SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
 SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
 SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
 SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
-#if CONFIG_EXT_PARTITION
-SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); }
-SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); }
-#endif
 
 static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
   if (rows == 4) {
@@ -244,25 +238,17 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
     if (cols == 16) return subtract_16x32;
     if (cols == 32) return subtract_32x32;
     if (cols == 64) return subtract_64x32;
-#if CONFIG_EXT_PARTITION
-    if (cols == 128) return subtract_128x32;
-#endif  // CONFIG_EXT_PARTITION
   }
   if (rows == 64) {
     if (cols == 16) return subtract_16x64;
     if (cols == 32) return subtract_32x64;
     if (cols == 64) return subtract_64x64;
-#if CONFIG_EXT_PARTITION
     if (cols == 128) return subtract_128x64;
-#endif  // CONFIG_EXT_PARTITION
   }
-#if CONFIG_EXT_PARTITION
   if (rows == 128) {
-    if (cols == 32) return subtract_32x128;
     if (cols == 64) return subtract_64x128;
     if (cols == 128) return subtract_128x128;
   }
-#endif  // CONFIG_EXT_PARTITION
   assert(0);
   return NULL;
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
index cf8ea498c..0d954e178 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -14,6 +14,8 @@
 
 %include "aom_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;unsigned int aom_highbd_calc16x16var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index 62acf3ed3..fdfadc886 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -12,13 +12,17 @@
 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
@@ -185,13 +189,11 @@ VAR_FN(16, 16, 16, 8);
 VAR_FN(16, 8, 8, 7);
 VAR_FN(8, 16, 8, 7);
 VAR_FN(8, 8, 8, 6);
-#if CONFIG_EXT_PARTITION_TYPES
 VAR_FN(16, 4, 16, 6);
 VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 16, 8);
+VAR_FN(32, 8, 8, 8);
 VAR_FN(16, 64, 16, 10);
 VAR_FN(64, 16, 16, 10);
-#endif
 
 #undef VAR_FN
 
@@ -398,7 +400,6 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
 #define FNS(opt)                        \
   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
@@ -416,20 +417,6 @@ DECLS(sse2);
   FN(32, 8, 16, 5, 3, opt, (int64_t));  \
   FN(16, 64, 16, 4, 6, opt, (int64_t)); \
   FN(64, 16, 16, 6, 4, opt, (int64_t))
-#else
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t))
-#endif
 
 FNS(sse2);
 
@@ -577,7 +564,6 @@ DECLS(sse2);
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
 #define FNS(opt)                        \
   FN(64, 64, 16, 6, 6, opt, (int64_t)); \
   FN(64, 32, 16, 6, 5, opt, (int64_t)); \
@@ -595,30 +581,104 @@ DECLS(sse2);
   FN(32, 8, 16, 5, 3, opt, (int64_t));  \
   FN(16, 64, 16, 4, 6, opt, (int64_t)); \
   FN(64, 16, 16, 6, 4, opt, (int64_t));
-#else
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));
-#endif
 
 FNS(sse2);
 
 #undef FNS
 #undef FN
 
-void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint16_t *comp_pred, int width, int height,
                                     int subpel_x_q3, int subpel_y_q3,
                                     const uint8_t *ref8, int ref_stride,
                                     int bd) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+      uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred);
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
     if (width >= 8) {
@@ -648,54 +708,48 @@ void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
         ref += 2 * ref_stride;
       }
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                               width, kernel, 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                              width, NULL, -1, kernel, 16, width, height, bd);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_highbd_convolve8_horiz(ref8, ref_stride,
-                                 CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
-                                 16, NULL, -1, width, height, bd);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
-                                width, NULL, -1, kernel, 16, width, height, bd);
-    } else {
-      DECLARE_ALIGNED(16, uint16_t,
-                      temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
-                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                 intermediate_height, bd);
-      aom_highbd_convolve8_vert(
-          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
-          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
-          16, width, height, bd);
-    }
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1),
+                               ref_stride, CONVERT_TO_BYTEPTR(temp),
+                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                               intermediate_height, bd);
+    aom_highbd_convolve8_vert(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+        MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+        16, width, height, bd);
   }
 }
 
-void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
-                                             const uint8_t *pred8, int width,
-                                             int height, int subpel_x_q3,
-                                             int subpel_y_q3,
-                                             const uint8_t *ref8,
-                                             int ref_stride, int bd) {
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   int n;
   int i;
-  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
-                            ref8, ref_stride, bd);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
   /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
   assert(!(width * height & 7));
   n = width * height >> 3;
@@ -707,3 +761,102 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
     pred += 8;
   }
 }
+
+static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                               const __m128i *w0,
+                                               const __m128i *w1,
+                                               const __m128i *r,
+                                               void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+  __m128i sum = _mm_adds_epu16(mult0, mult1);
+  __m128i round = _mm_adds_epu16(sum, *r);
+  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, shift);
+}
+
+void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred,
+                                       const uint8_t *pred8, int width,
+                                       int height, const uint8_t *ref8,
+                                       int ref_stride,
+                                       const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // Read 8 pixels one row at a time
+    assert(!(width & 7));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 8) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+        comp_pred += 8;
+        pred += 8;
+        ref += 8;
+      }
+      ref += ref_stride - width;
+    }
+  } else {
+    // Read 4 pixels two rows at a time
+    assert(!(width & 3));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+      comp_pred += 8;
+      pred += 8;
+      ref += 2 * ref_stride;
+    }
+  }
+}
+
+void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  int n;
+  int i;
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd);
+  assert(!(width * height & 7));
+  n = width * height >> 3;
+
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+    comp_pred += 8;
+    pred += 8;
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
index cc7f52811..6c247a91b 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -11,8 +11,8 @@
 
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/variance.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
index 6b8922b8c..1e67d392e 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -11,7 +11,20 @@
 
 #include <immintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
+  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i y0 = _mm256_sad_epu8(x0, zero);
+  __m256i y1 = _mm256_sad_epu8(x1, zero);
+  y0 = _mm256_add_epi64(y0, y1);
+  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
+  y0 = _mm256_add_epi64(u0, y0);
+  u0 = _mm256_unpackhi_epi64(y0, y0);
+  return _mm256_add_epi16(y0, u0);
+}
 
 static INLINE __m256i dc_sum_32(const uint8_t *ref) {
   const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
@@ -25,13 +38,31 @@ static INLINE __m256i dc_sum_32(const uint8_t *ref) {
 
 static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                   ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
+  for (int i = 0; i < height; ++i) {
     _mm256_storeu_si256((__m256i *)dst, *r);
     dst += stride;
   }
 }
 
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+                                    int height, uint8_t *dst,
+                                    ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r0);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+    dst += stride;
+  }
+}
+
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+    dst += stride;
+  }
+}
+
 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const __m256i sum_above = dc_sum_32(above);
@@ -168,11 +199,58 @@ void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   uint32_t sum = _mm_cvtsi128_si32(left_sum);
   sum += 24;
   sum /= 48;
-
   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_32(above);
+  __m256i sum_left = dc_sum_64(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 48;
+  sum /= 96;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = dc_sum_64(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 64;
+  sum /= 128;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = dc_sum_32(left);
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 48;
+  sum /= 96;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m256i sum_above = dc_sum_64(above);
+  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
+  sum_left = _mm256_add_epi16(sum_left, sum_above);
+  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+  sum += 40;
+  sum /= 80;
+  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -187,6 +265,62 @@ void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_32(above);
+  (void)left;
+
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);
+  sum = _mm256_srai_epi16(sum, 5);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);
+  (void)left;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);
+  (void)left;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);
+  (void)left;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
@@ -202,6 +336,63 @@ void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m256i sum = dc_sum_64(left);
+  (void)above;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m256i sum = dc_sum_64(left);
+  (void)above;
+
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m256i sum = dc_sum_32(left);
+  (void)above;
+
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);
+  sum = _mm256_srai_epi16(sum, 5);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i row = _mm256_shuffle_epi8(sum, zero);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i sum = dc_sum_16_sse2(left);
+  (void)above;
+
+  const __m128i eight = _mm_set1_epi16(8);
+  sum = _mm_add_epi16(sum, eight);
+  sum = _mm_srai_epi16(sum, 4);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i r = _mm_shuffle_epi8(sum, zero);
+  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -211,6 +402,42 @@ void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
@@ -218,8 +445,39 @@ void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+  (void)left;
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
-// TM_PRED
+// PAETH_PRED
 
 // Return 16 16-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
@@ -336,6 +594,26 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);
+
+  for (int j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+      _mm_store_si128((__m128i *)dst, row);
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
 // Return 32 8-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                       const __m256i *top1,
@@ -411,3 +689,123 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
     rep = _mm256_add_epi16(rep, one);
   }
 }
+
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i t2 = get_top_vector(above + 32);
+  const __m256i t3 = get_top_vector(above + 48);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i t2 = get_top_vector(above + 32);
+  const __m256i t3 = get_top_vector(above + 48);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);
+    __m256i rep = _mm256_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i t0 = get_top_vector(above);
+  const __m256i t1 = get_top_vector(above + 16);
+  const __m256i t2 = get_top_vector(above + 32);
+  const __m256i t3 = get_top_vector(above + 48);
+  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+  const __m256i one = _mm256_set1_epi16(1);
+
+  int i;
+  const __m256i l = get_left_vector(left);
+  __m256i rep = _mm256_set1_epi16(0x8000);
+  for (i = 0; i < 16; ++i) {
+    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+
+    dst += stride;
+    rep = _mm256_add_epi16(rep, one);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
index 2a83b9001..5b2452c8e 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -11,11 +11,11 @@
 
 #include <emmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
 
-static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < 4; ++i) {
+static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+                                ptrdiff_t stride) {
+  for (int i = 0; i < height; i += 2) {
     *(uint32_t *)dst = dc;
     dst += stride;
     *(uint32_t *)dst = dc;
@@ -51,6 +51,17 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
   }
 }
 
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, *row);
+    _mm_store_si128((__m128i *)(dst + 16), *row);
+    _mm_store_si128((__m128i *)(dst + 32), *row);
+    _mm_store_si128((__m128i *)(dst + 48), *row);
+    dst += stride;
+  }
+}
+
 static INLINE __m128i dc_sum_4(const uint8_t *ref) {
   __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   const __m128i zero = _mm_setzero_si128();
@@ -83,6 +94,34 @@ static INLINE __m128i dc_sum_32(const uint8_t *ref) {
   return _mm_add_epi16(x0, high);
 }
 
+static INLINE __m128i dc_sum_64(const uint8_t *ref) {
+  __m128i x0 = _mm_load_si128((__m128i const *)ref);
+  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
+  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
+  const __m128i zero = _mm_setzero_si128();
+  x0 = _mm_sad_epu8(x0, zero);
+  x1 = _mm_sad_epu8(x1, zero);
+  x2 = _mm_sad_epu8(x2, zero);
+  x3 = _mm_sad_epu8(x3, zero);
+  x0 = _mm_add_epi16(x0, x1);
+  x2 = _mm_add_epi16(x2, x3);
+  x0 = _mm_add_epi16(x0, x2);
+  const __m128i high = _mm_unpackhi_epi64(x0, x0);
+  return _mm_add_epi16(x0, high);
+}
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier) {
+  const int interm = num >> shift1;
+  return interm * multiplier >> DC_SHIFT2;
+}
+
 // -----------------------------------------------------------------------------
 // DC_PRED
 
@@ -94,11 +133,26 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 6;
-  sum /= 12;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_16(left);
+  __m128i sum_above = dc_sum_4(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 10;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  const uint32_t pred = _mm_cvtsi128_si32(row);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -109,7 +163,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 6;
-  sum /= 12;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
 
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_8xh(&row, 4, dst, stride);
@@ -123,11 +177,37 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 12;
-  sum /= 24;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_32(left);
+  __m128i sum_above = dc_sum_8(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 20;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_4(left);
+  __m128i sum_above = dc_sum_16(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 10;
+  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m128i sum_left = dc_sum_8(left);
@@ -136,7 +216,7 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 12;
-  sum /= 24;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_16xh(&row, 8, dst, stride);
 }
@@ -149,11 +229,37 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 24;
-  sum /= 48;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_64(left);
+  __m128i sum_above = dc_sum_16(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 40;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sum_left = dc_sum_8(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 20;
+  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   __m128i sum_above = dc_sum_32(above);
@@ -162,11 +268,63 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
 
   uint32_t sum = _mm_cvtsi128_si32(sum_above);
   sum += 24;
-  sum /= 48;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
   const __m128i row = _mm_set1_epi8((uint8_t)sum);
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sum_left = dc_sum_64(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 48;
+  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_64(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 64;
+  sum /= 128;
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_32(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 48;
+  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i sum_left = dc_sum_16(left);
+  sum_above = _mm_add_epi16(sum_above, sum_left);
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 40;
+  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_TOP
 
@@ -181,7 +339,21 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   sum_above = _mm_packus_epi16(sum_above, sum_above);
 
   const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_4(above);
+  const __m128i two = _mm_set1_epi16((int16_t)2);
+  sum_above = _mm_add_epi16(sum_above, two);
+  sum_above = _mm_srai_epi16(sum_above, 2);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -208,6 +380,31 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_8(above);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_above = _mm_add_epi16(sum_above, four);
+  sum_above = _mm_srai_epi16(sum_above, 3);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_16(above);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);
+  sum_above = _mm_srai_epi16(sum_above, 4);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)left;
@@ -235,6 +432,33 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_16(above);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);
+  sum_above = _mm_srai_epi16(sum_above, 4);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);
+  sum_above = _mm_srai_epi16(sum_above, 5);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -249,6 +473,62 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_32(above);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);
+  sum_above = _mm_srai_epi16(sum_above, 5);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  __m128i sum_above = dc_sum_64(above);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_LEFT
 
@@ -263,7 +543,22 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   sum_left = _mm_packus_epi16(sum_left, sum_left);
 
   const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_16(left);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);
+  sum_left = _mm_srai_epi16(sum_left, 4);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -291,6 +586,33 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_32(left);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);
+  sum_left = _mm_srai_epi16(sum_left, 5);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_4(left);
+  const __m128i two = _mm_set1_epi16((uint16_t)2);
+  sum_left = _mm_add_epi16(sum_left, two);
+  sum_left = _mm_srai_epi16(sum_left, 2);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -319,6 +641,34 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_8(left);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_left = _mm_add_epi16(sum_left, four);
+  sum_left = _mm_srai_epi16(sum_left, 3);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
@@ -333,6 +683,62 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_64(left);
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+  sum_left = _mm_add_epi16(sum_left, thirtytwo);
+  sum_left = _mm_srai_epi16(sum_left, 6);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_32(left);
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);
+  sum_left = _mm_srai_epi16(sum_left, 5);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  __m128i sum_left = dc_sum_16(left);
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);
+  sum_left = _mm_srai_epi16(sum_left, 4);
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // DC_128
 
@@ -341,7 +747,15 @@ void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   (void)above;
   (void)left;
   const uint32_t pred = 0x80808080;
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const uint32_t pred = 0x80808080;
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -360,6 +774,22 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   (void)above;
@@ -377,6 +807,23 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 8, dst, stride);
+}
+
 void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -386,6 +833,42 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // V_PRED
 
@@ -393,7 +876,14 @@ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   const uint32_t pred = *(uint32_t *)above;
   (void)left;
-  dc_store_4x8(pred, dst, stride);
+  dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint32_t pred = *(uint32_t *)above;
+  (void)left;
+  dc_store_4xh(pred, 16, dst, stride);
 }
 
 void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
@@ -410,6 +900,20 @@ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_8xh(&row, 16, dst, stride);
 }
 
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+  (void)left;
+  dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_load_si128((__m128i const *)above);
+  (void)left;
+  dc_store_16xh(&row, 4, dst, stride);
+}
+
 void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   const __m128i row = _mm_load_si128((__m128i const *)above);
@@ -424,19 +928,75 @@ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
   dc_store_16xh(&row, 32, dst, stride);
 }
 
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i row = _mm_load_si128((__m128i const *)above);
+  (void)left;
+  dc_store_16xh(&row, 64, dst, stride);
+}
+
+static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
   const __m128i row0 = _mm_load_si128((__m128i const *)above);
   const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, row0);
+    _mm_store_si128((__m128i *)(dst + 16), row1);
+    dst += stride;
+  }
+}
+
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
   (void)left;
-  int i;
-  for (i = 0; i < 16; ++i) {
+  v_predictor_32xh(dst, stride, above, 8);
+}
+
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_32xh(dst, stride, above, 16);
+}
+
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_32xh(dst, stride, above, 64);
+}
+
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
+  const __m128i row0 = _mm_load_si128((__m128i const *)above);
+  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+  for (int i = 0; i < height; ++i) {
     _mm_store_si128((__m128i *)dst, row0);
     _mm_store_si128((__m128i *)(dst + 16), row1);
+    _mm_store_si128((__m128i *)(dst + 32), row2);
+    _mm_store_si128((__m128i *)(dst + 48), row3);
     dst += stride;
   }
 }
 
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 64);
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 32);
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 16);
+}
+
 // -----------------------------------------------------------------------------
 // H_PRED
 
@@ -471,25 +1031,7 @@ void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
   *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
 }
 
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)above;
   const __m128i left_col = _mm_load_si128((__m128i const *)left);
@@ -500,13 +1042,13 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
@@ -514,26 +1056,26 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   row0 = _mm_shufflelo_epi16(left_col_high, 0);
   row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
   dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
   dst += stride;
 
   left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
@@ -541,6 +1083,24 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
   row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
   row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+  dst += stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+  left_col = _mm_unpacklo_epi8(left_col, left_col);
+  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
   _mm_storel_epi64((__m128i *)dst, row0);
   dst += stride;
   _mm_storel_epi64((__m128i *)dst, row1);
@@ -550,6 +1110,82 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
   _mm_storel_epi64((__m128i *)dst, row3);
 }
 
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above, const uint8_t *left,
+                                      int count) {
+  (void)above;
+  for (int i = 0; i < count; ++i) {
+    const __m128i left_col = _mm_load_si128((__m128i const *)left);
+    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+    row0 = _mm_shufflelo_epi16(left_col_low, 0);
+    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    row0 = _mm_shufflelo_epi16(left_col_high, 0);
+    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+
+    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+    row0 = _mm_shufflelo_epi16(left_col_high, 0);
+    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+    _mm_storel_epi64((__m128i *)dst, row0);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row1);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row2);
+    dst += stride;
+    _mm_storel_epi64((__m128i *)dst, row3);
+    dst += stride;
+    left += 16;
+  }
+}
+
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  h_predictor_8x16xc(dst, stride, above, left, 1);
+}
+
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  h_predictor_8x16xc(dst, stride, above, left, 2);
+}
+
 static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
                                      ptrdiff_t stride) {
   int i;
@@ -601,6 +1237,14 @@ static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
   h_pred_store_16xh(row, 4, dst, stride);
 }
 
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_16x8_1(&left_col_8p, dst, stride);
+}
+
 void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   (void)above;
@@ -611,29 +1255,38 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
   h_prediction_16x8_2(&left_col_8p, dst, stride);
 }
 
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int count) {
   int i = 0;
-
   do {
-    left_col = _mm_load_si128((const __m128i *)left);
-    left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p, dst, stride);
+    const __m128i left_col = _mm_load_si128((const __m128i *)left);
+    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
+    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
     dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p, dst, stride);
+    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
     dst += stride << 2;
 
-    left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p, dst, stride);
+    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
+    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
     dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p, dst, stride);
+    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
     dst += stride << 2;
 
     left += 16;
     i++;
-  } while (i < 2);
+  } while (i < count);
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_16xh(dst, stride, left, 2);
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_16xh(dst, stride, left, 4);
 }
 
 static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
@@ -664,6 +1317,19 @@ static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
   h_pred_store_32xh(row, 4, dst, stride);
 }
 
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  __m128i left_col, left_col_8p;
+  (void)above;
+
+  left_col = _mm_load_si128((const __m128i *)left);
+
+  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+  h_prediction_32x8_1(&left_col_8p, dst, stride);
+  dst += stride << 2;
+  h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
 void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   __m128i left_col, left_col_8p;
@@ -682,3 +1348,83 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
   dst += stride << 2;
   h_prediction_32x8_2(&left_col_8p, dst, stride);
 }
+
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int height) {
+  int i = height >> 2;
+  do {
+    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r0);
+    _mm_store_si128((__m128i *)(dst + stride), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+    left += 4;
+    dst += stride * 4;
+  } while (--i);
+}
+
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_32xh(dst, stride, left, 64);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *left, int height) {
+  int i = height >> 2;
+  do {
+    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    left4 = _mm_unpacklo_epi8(left4, left4);
+    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r0);
+    _mm_store_si128((__m128i *)(dst + 32), r0);
+    _mm_store_si128((__m128i *)(dst + 48), r0);
+    _mm_store_si128((__m128i *)(dst + stride), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+    left += 4;
+    dst += stride * 4;
+  } while (--i);
+}
+
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 64);
+}
+
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 32);
+}
+
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  h_predictor_64xh(dst, stride, left, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
index 85b82744e..807ed1770 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -11,11 +11,12 @@
 
 #include <tmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/intrapred_common.h"
 
 // -----------------------------------------------------------------------------
-// TM_PRED
+// PAETH_PRED
 
 // Return 8 16-bit pixels in one row
 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
@@ -82,6 +83,26 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i l = _mm_load_si128((const __m128i *)left);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 16; ++i) {
+    const __m128i l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -145,6 +166,28 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
 // Return 16 8-bit pixels in one row
 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                       const __m128i *top1,
@@ -154,6 +197,27 @@ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
   return _mm_packus_epi16(p0, p1);
 }
 
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  const __m128i t = _mm_load_si128((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+    _mm_store_si128((__m128i *)dst, row);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
   __m128i l = _mm_loadl_epi64((const __m128i *)left);
@@ -234,6 +298,57 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i t = _mm_load_si128((const __m128i *)above);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+      _mm_store_si128((__m128i *)dst, row);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+  __m128i l16;
+
+  for (int i = 0; i < 8; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+    _mm_store_si128((__m128i *)dst, r32l);
+    _mm_store_si128((__m128i *)(dst + 16), r32h);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
@@ -307,6 +422,162 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
   }
 }
 
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r32l);
+      _mm_store_si128((__m128i *)(dst + 16), r32h);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 2; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);
+    for (i = 0; i < 16; ++i) {
+      l16 = _mm_shuffle_epi8(l, rep);
+      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+      _mm_store_si128((__m128i *)dst, r0);
+      _mm_store_si128((__m128i *)(dst + 16), r1);
+      _mm_store_si128((__m128i *)(dst + 32), r2);
+      _mm_store_si128((__m128i *)(dst + 48), r3);
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);
+    }
+  }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i a = _mm_load_si128((const __m128i *)above);
+  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i al = _mm_unpacklo_epi8(a, zero);
+  const __m128i ah = _mm_unpackhi_epi8(a, zero);
+  const __m128i bl = _mm_unpacklo_epi8(b, zero);
+  const __m128i bh = _mm_unpackhi_epi8(b, zero);
+  const __m128i cl = _mm_unpacklo_epi8(c, zero);
+  const __m128i ch = _mm_unpackhi_epi8(c, zero);
+  const __m128i dl = _mm_unpacklo_epi8(d, zero);
+  const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i l16;
+
+  int i;
+  const __m128i l = _mm_load_si128((const __m128i *)left);
+  __m128i rep = _mm_set1_epi16(0x8000);
+  for (i = 0; i < 16; ++i) {
+    l16 = _mm_shuffle_epi8(l, rep);
+    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+    _mm_store_si128((__m128i *)dst, r0);
+    _mm_store_si128((__m128i *)(dst + 16), r1);
+    _mm_store_si128((__m128i *)(dst + 32), r2);
+    _mm_store_si128((__m128i *)(dst + 48), r3);
+    dst += stride;
+    rep = _mm_add_epi16(rep, one);
+  }
+}
+
 // -----------------------------------------------------------------------------
 // SMOOTH_PRED
 
@@ -315,9 +586,15 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
 // pixels[2]: right_pred vector
 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  if (height == 4)
+    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
   pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-  pixels[1] = _mm_loadl_epi64((const __m128i *)left);
 
   const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
@@ -325,45 +602,52 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
   pixels[0] = _mm_unpacklo_epi16(d, bp);
 }
 
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vecotr
-// weights[2]: weights_w and scale - weights_w interleave vector
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
 static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weights) {
-  __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+                                  __m128i *weight_h, __m128i *weight_w) {
   const __m128i zero = _mm_setzero_si128();
-
-  weights[0] = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weights[1] = _mm_sub_epi16(d, weights[0]);
-  weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]);
+  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+  weight_h[0] = _mm_unpacklo_epi8(t, zero);
+  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
 
   if (height == 8) {
-    t = _mm_srli_si128(t, 4);
-    weights[0] = _mm_unpacklo_epi8(t, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
   }
 }
 
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight,
-                                   int h, uint8_t *dst, ptrdiff_t stride) {
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+                                   const __m128i *ww, int h, uint8_t *dst,
+                                   ptrdiff_t stride, int second_half) {
   const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
   __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
 
     __m128i b = _mm_shuffle_epi8(pixel[1], rep);
     b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, weight[2]);
+    __m128i sum = _mm_madd_epi16(b, ww[0]);
 
     sum = _mm_add_epi32(s, sum);
     sum = _mm_add_epi32(sum, round);
@@ -383,10 +667,10 @@ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 4, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 4, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 4, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 4, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
 }
 
 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
@@ -394,33 +678,68 @@ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
   __m128i pixels[3];
   load_pixel_w4(above, left, 8, pixels);
 
-  __m128i weights[3];
-  load_weight_w4(sm_weight_arrays, 8, weights);
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[3];
+  load_pixel_w4(above, left, 16, pixels);
+
+  __m128i wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 16, wh, ww);
 
-  smooth_pred_4xh(pixels, weights, 8, dst, stride);
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
 // pixels[0]: above and below_pred interleave vector, first half
 // pixels[1]: above and below_pred interleave vector, second half
 // pixels[2]: left vector
 // pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                  int height, __m128i *pixels) {
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-  pixels[2] = _mm_load_si128((const __m128i *)left);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
   const __m128i zero = _mm_setzero_si128();
-
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
   d = _mm_unpacklo_epi8(d, zero);
   pixels[0] = _mm_unpacklo_epi16(d, bp);
   pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[2] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[2] = _mm_load_si128((const __m128i *)left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[7] = pixels[3];
+  }
 }
 
 // weight_h[0]: weight_h vector
 // weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
 static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
@@ -429,7 +748,6 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
   const int we_offset = height < 8 ? 4 : 8;
   __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
   weight_h[0] = _mm_unpacklo_epi8(we, zero);
-
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
   weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
 
@@ -450,6 +768,19 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
     weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
     weight_h[2] = _mm_unpackhi_epi8(we, zero);
     weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else if (height == 32) {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
@@ -531,355 +862,831 @@ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
 }
 
-// pixels[0]: above and below_pred interleave vector, 1/4
-// pixels[1]: above and below_pred interleave vector, 2/4
-// pixels[2]: above and below_pred interleave vector, 3/4
-// pixels[3]: above and below_pred interleave vector, 3/4
-// pixels[4]: left vector
-// pixels[5]: left vector, h = 32 only
-// pixels[6]: right_pred vector
-static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab = _mm_load_si128((const __m128i *)above);
-  pixels[6] = _mm_set1_epi16((uint16_t)above[15]);
-  pixels[4] = _mm_load_si128((const __m128i *)left);
-  pixels[5] = _mm_load_si128((const __m128i *)(left + 16));
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i pixels[8];
+  load_pixel_w8(above, left, 32, pixels);
+
+  __m128i wh[8], ww[2];
+  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, uint32_t bw,
+                                        uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i top_right =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
+    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
+      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
+      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
+
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      const __m128i scale_m_weights_x =
+          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
+      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
+      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
+      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
+      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
+
+      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
+      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
+
+      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
+      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
 }
 
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// ... ...
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-// ... ...
-static INLINE void load_weight_w16(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
 
-  if (height == 8) {
-    weight_h[0] = _mm_unpacklo_epi8(w8, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);  // scale - weight_h
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// pixels[0]: above and below_pred interleave vector
+static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weights[0]: weights_h vector
+// weights[1]: scale - weights_h vector
+static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height == 4) {
+    const __m128i weight =
+        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else if (height == 8) {
+    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+  } else {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weights[0] = _mm_unpacklo_epi8(weight, zero);
+    weights[1] = _mm_sub_epi16(d, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);
+    weights[3] = _mm_sub_epi16(d, weights[2]);
   }
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i inc = _mm_set1_epi16(0x202);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i d = _mm_set1_epi16(0x100);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  for (int i = 0; i < h; ++i) {
+    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
+    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
+    d = _mm_add_epi16(d, inc);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 4, &pixels);
+
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 4, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 8, &pixels);
 
-    __m128i x = _mm_unpacklo_epi8(w16, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
-    x = _mm_unpackhi_epi8(w16, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+  __m128i weights[2];
+  load_weight_v_w4(sm_weight_arrays, 8, weights);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels;
+  load_pixel_v_w4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  load_weight_v_w4(sm_weight_arrays, 16, weights);
+
+  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+  d = _mm_unpacklo_epi8(d, zero);
+  pixels[0] = _mm_unpacklo_epi16(d, bp);
+  pixels[1] = _mm_unpackhi_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_h) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+  if (height < 16) {
+    const int offset = height < 8 ? 4 : 8;
+    const __m128i weight =
+        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+  } else if (height == 16) {
+    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+  } else {
+    const __m128i weight_lo =
+        _mm_loadu_si128((const __m128i *)&weight_array[32]);
+    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+    const __m128i weight_hi =
+        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
     weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
+    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
     weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
   }
 }
 
-static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
-  const __m128i one = _mm_set1_epi16(1);
+static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
+                                     int h, uint8_t *dst, ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[4] : pixels[5];
+  __m128i d = _mm_set1_epi16(0x100);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
+  for (int i = 0; i < h; ++i) {
     const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
     const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
     const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
     __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
     __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-    __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc);
-    __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc);
 
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[6]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-    __m128i sum2 = _mm_madd_epi16(b, ww[2]);
-    __m128i sum3 = _mm_madd_epi16(b, ww[3]);
+    s0 = _mm_add_epi32(s0, pred_round);
+    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
 
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    s2 = _mm_add_epi32(s2, sum2);
-    s2 = _mm_add_epi32(s2, round);
-    s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale);
-
-    s3 = _mm_add_epi32(s3, sum3);
-    s3 = _mm_add_epi32(s3, round);
-    s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    sum1 = _mm_packus_epi16(s2, s3);
-    sum1 = _mm_shuffle_epi8(sum1, gat);
-
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    _mm_storel_epi64((__m128i *)(dst + 8), sum1);
+    s1 = _mm_add_epi32(s1, pred_round);
+    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
 
+    __m128i sum01 = _mm_packus_epi16(s0, s1);
+    sum01 = _mm_shuffle_epi8(sum01, gat);
+    _mm_storel_epi64((__m128i *)dst, sum01);
     dst += stride;
-    rep = _mm_add_epi16(rep, one);
+
     d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 8, pixels);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 4, pixels);
 
-  __m128i wh[2], ww[4];
-  load_weight_w16(sm_weight_arrays, 8, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 4, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
 }
 
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 8, pixels);
 
-  __m128i wh[4], ww[4];
-  load_weight_w16(sm_weight_arrays, 16, wh, ww);
+  __m128i wh[2];
+  load_weight_v_w8(sm_weight_arrays, 8, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 16, pixels);
+
+  __m128i wh[4];
+  load_weight_v_w8(sm_weight_arrays, 16, wh);
+
+  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
 }
 
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[7];
-  load_pixel_w16(above, left, 32, pixels);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_v_w8(above, left, 32, pixels);
 
-  __m128i wh[8], ww[4];
-  load_weight_w16(sm_weight_arrays, 32, wh, ww);
+  __m128i wh[8];
+  load_weight_v_w8(sm_weight_arrays, 32, wh);
 
-  smooth_pred_16x8(pixels, wh, ww, dst, stride, 0);
+  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
   dst += stride << 3;
-  smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
 }
 
-static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left,
-                                  int height, __m128i *pixels) {
-  __m128i ab0 = _mm_load_si128((const __m128i *)above);
-  __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16));
+static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i bottom_left =
+      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i round =
+      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+    const __m128i scale_m_weights_y =
+        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
+    const __m128i wl_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+      // 8 -> 16
+      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
+      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
+      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
+      // top_x * weights_y + scale_m_weights_y * bottom_left
+      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+      pred_lo = _mm_add_epi32(pred_lo, round);
+      pred_hi = _mm_add_epi32(pred_hi, round);
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
 
-  pixels[10] = _mm_set1_epi16((uint16_t)above[31]);
-  pixels[8] = _mm_load_si128((const __m128i *)left);
-  pixels[9] = _mm_load_si128((const __m128i *)(left + 16));
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
+}
 
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
+}
 
-  __m128i x = _mm_unpacklo_epi8(ab0, zero);
-  pixels[0] = _mm_unpacklo_epi16(x, bp);
-  pixels[1] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
+}
 
-  x = _mm_unpackhi_epi8(ab0, zero);
-  pixels[2] = _mm_unpacklo_epi16(x, bp);
-  pixels[3] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
+}
 
-  x = _mm_unpacklo_epi8(ab1, zero);
-  pixels[4] = _mm_unpacklo_epi16(x, bp);
-  pixels[5] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
+}
 
-  x = _mm_unpackhi_epi8(ab1, zero);
-  pixels[6] = _mm_unpacklo_epi16(x, bp);
-  pixels[7] = _mm_unpackhi_epi16(x, bp);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
 }
 
-static INLINE void load_weight_w32(const uint8_t *weight_array, int height,
-                                   __m128i *weight_h, __m128i *weight_w) {
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  if (height == 4)
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  else if (height == 8)
+    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
+  else
+    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
+  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+}
+
+// weights[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
+                                    __m128i *weights) {
+  (void)height;
+  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
   const __m128i zero = _mm_setzero_si128();
-  __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-  __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]);
-  __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+
+  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
   const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
+  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+}
 
-  if (height == 16) {
-    weight_h[0] = _mm_unpacklo_epi8(w16, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w16, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
+                                     const __m128i *weight, int h, uint8_t *dst,
+                                     ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i gat = _mm_set1_epi32(0xc080400);
+  __m128i rep = _mm_set1_epi16(0x8000);
 
-    __m128i x = _mm_unpacklo_epi8(w32_0, zero);
-    __m128i y = _mm_sub_epi16(d, x);
-    weight_w[0] = _mm_unpacklo_epi16(x, y);
-    weight_w[1] = _mm_unpackhi_epi16(x, y);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
+    b = _mm_unpacklo_epi16(b, pixel[1]);
+    __m128i sum = _mm_madd_epi16(b, weight[0]);
 
-    x = _mm_unpackhi_epi8(w32_0, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[2] = _mm_unpacklo_epi16(x, y);
-    weight_w[3] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
 
-    x = _mm_unpacklo_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[4] = _mm_unpacklo_epi16(x, y);
-    weight_w[5] = _mm_unpackhi_epi16(x, y);
+    sum = _mm_shuffle_epi8(sum, gat);
+    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+    dst += stride;
 
-    x = _mm_unpackhi_epi8(w32_1, zero);
-    y = _mm_sub_epi16(d, x);
-    weight_w[6] = _mm_unpacklo_epi16(x, y);
-    weight_w[7] = _mm_unpackhi_epi16(x, y);
+    rep = _mm_add_epi16(rep, one);
   }
+}
 
-  if (height == 32) {
-    weight_h[0] = _mm_unpacklo_epi8(w32_0, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(w32_0, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 4, pixels);
 
-    weight_h[4] = _mm_unpacklo_epi8(w32_1, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(w32_1, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 4, &weights);
 
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-    weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]);
-    weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]);
+  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
+}
 
-    weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]);
-    weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]);
-    weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]);
-    weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 8, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w4(above, left, 16, pixels);
+
+  __m128i weights;
+  load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+  dst += stride << 3;
+
+  pixels[0] = _mm_srli_si128(pixels[0], 8);
+  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+// pixels[2]: left vector + 16
+// pixels[3]: right_pred vector
+static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
+                                   int height, __m128i *pixels) {
+  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+
+  if (height == 4) {
+    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+  } else if (height == 8) {
+    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
+  } else if (height == 16) {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+  } else {
+    pixels[0] = _mm_load_si128((const __m128i *)left);
+    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
+    pixels[3] = pixels[1];
   }
 }
 
-static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh,
-                                    const __m128i *ww, uint8_t *dst,
-                                    ptrdiff_t stride, int quarter) {
-  __m128i d = _mm_set1_epi16(0x100);
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
+                                    __m128i *weight_w) {
+  (void)height;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+}
+
+static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
+                                     int h, uint8_t *dst, ptrdiff_t stride,
+                                     int second_half) {
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
   const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
   const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  __m128i rep =
-      (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008);
-  const __m128i left = (quarter < 2) ? pixels[8] : pixels[9];
+  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
 
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+  for (int i = 0; i < h; ++i) {
+    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
+    b = _mm_unpacklo_epi16(b, pixels[1]);
+    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
 
-    int j;
-    __m128i s[8];
-    __m128i b = _mm_shuffle_epi8(left, rep);
-    b = _mm_unpacklo_epi16(b, pixels[10]);
+    sum0 = _mm_add_epi32(sum0, pred_round);
+    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; ++j) {
-      s[j] = _mm_madd_epi16(pixels[j], wh_sc);
-      s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j]));
-      s[j] = _mm_add_epi32(s[j], round);
-      s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale);
-    }
+    sum1 = _mm_add_epi32(sum1, pred_round);
+    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
 
-    for (j = 0; j < 8; j += 2) {
-      __m128i sum = _mm_packus_epi16(s[j], s[j + 1]);
-      sum = _mm_shuffle_epi8(sum, gat);
-      _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum);
-    }
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, gat);
+    _mm_storel_epi64((__m128i *)dst, sum0);
     dst += stride;
+
     rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
   }
 }
 
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 16, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 4, pixels);
 
-  __m128i wh[4], ww[8];
-  load_weight_w32(sm_weight_arrays, 16, wh, ww);
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 4, ww);
 
-  smooth_pred_32x8(pixels, wh, ww, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
 }
 
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  __m128i pixels[11];
-  load_pixel_w32(above, left, 32, pixels);
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 8, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 8, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[2];
+  load_pixel_h_w8(above, left, 16, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 16, ww);
+
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
+}
 
-  __m128i wh[8], ww[8];
-  load_weight_w32(sm_weight_arrays, 32, wh, ww);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i pixels[4];
+  load_pixel_h_w8(above, left, 32, pixels);
+
+  __m128i ww[2];
+  load_weight_h_w8(sm_weight_arrays, 32, ww);
 
-  smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1);
+  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
   dst += stride << 3;
-  smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3);
+  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *above,
+                                          const uint8_t *left, uint32_t bw,
+                                          uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value =
+      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+    const __m128i tr_ly =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const __m128i weights_x =
+          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
+      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
+      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
+      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
+      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
+      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
+
+      pred_lo = _mm_add_epi32(pred_lo, pred_round);
+      pred_hi = _mm_add_epi32(pred_hi, pred_round);
+
+      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      pred = _mm_shuffle_epi8(pred, gat);
+      _mm_storel_epi64((__m128i *)(dst + x), pred);
+    }
+    dst += stride;
+  }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left) {
+  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
 }
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
deleted file mode 100644
index bc1bb2ff3..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pb_1: times 16 db 1
-sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
-sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
-sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
-sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb               %4, %1, %3
-  pxor                %3, %1
-  pand                %3, [GLOBAL(pb_1)]
-  psubb               %4, %3
-  pavgb               %4, %2
-%endmacro
-
-INIT_XMM ssse3
-cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m3, [aboveq]
-  pshufb              m1, m3, [GLOBAL(sh_b23456777)]
-  pshufb              m2, m3, [GLOBAL(sh_b12345677)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
-  pavgb               m3, m2
-
-  ; store 4 lines
-  movd    [dstq        ], m3
-  movd    [dstq+strideq], m4
-  lea               dstq, [dstq+strideq*2]
-  psrldq              m3, 1
-  psrldq              m4, 1
-  movd    [dstq        ], m3
-  movd    [dstq+strideq], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  movd                m0, [leftq]               ; l1, l2, l3, l4
-  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
-  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
-  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
-  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
-  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1
-  ; A2 B2 A1 B1
-  ; A3 B3 A2 B2
-  ; A4 B4 A3 B3
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
-  pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
-
-  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
-
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+stride3q ], m3
-  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+strideq*2], m3
-  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
-  movd  [dstq+strideq  ], m3
-  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
-  movd  [dstq          ], m3
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
-  movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
-  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [word]
-  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [word]
-  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [word]
-  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [word]
-  psrldq              m4, m0, 1                       ; t1-7 [word]
-  psrldq              m5, m0, 2                       ; t2-7 [word]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1
-  ; A2 B2 A1 B1 C1 D1 E1 F1
-  ; A3 B3 A2 B2 A1 B1 C1 D1
-  ; A4 B4 A3 B3 A2 B2 A1 B1
-  ; A5 B5 A4 B4 A3 B3 A2 B2
-  ; A6 B6 A5 B5 A4 B4 A3 B3
-  ; A7 B7 A6 B6 A5 B5 A4 B4
-  ; A8 B8 A7 B7 A6 B6 A5 B5
-  pavgb               m6, m1, m2                ; 2-tap avg A8-A1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
-
-  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
-
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-
-  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
-  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
-  movq  [dstq+strideq*2], m0
-  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
-  movq  [dstq+strideq  ], m0
-  psrldq              m0, 2                     ; A-H1
-  movq  [dstq          ], m0
-  lea               dstq, [dstq+strideq*4]
-  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
-  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
-  movq  [dstq+strideq*2], m6
-  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
-  movq  [dstq+strideq  ], m6
-  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
-  movq  [dstq          ], m6
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  mova                m0, [leftq]
-  movu                m7, [aboveq-1]
-  ; comments below are for a predictor like this
-  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
-  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
-  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
-  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
-  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
-  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
-  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
-  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
-  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
-  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
-  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
-  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
-  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
-  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
-  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
-  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
-  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr             m5, m0, m6, 15
-  palignr             m3, m0, m6, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
-  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)]
-  pavgb               m5, m0                            ; A1 - Ag
-
-  punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
-  punpckhbw           m4, m5                            ; A-B9 ... A-Bg
-
-  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1
-
-  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  palignr             m2, m1, m6, 14
-  mova  [dstq          ], m2
-  palignr             m2, m1, m6, 12
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m1, m6, 10
-  mova  [dstq+strideq*2], m2
-  palignr             m2, m1, m6, 8
-  mova  [dstq+stride3q ], m2
-  lea               dstq, [dstq+strideq*4]
-  palignr             m2, m1, m6, 6
-  mova  [dstq          ], m2
-  palignr             m2, m1, m6, 4
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m1, m6, 2
-  mova  [dstq+strideq*2], m2
-  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
-  mova  [dstq+stride3q ], m6
-  lea               dstq, [dstq+strideq*4]
-
-  palignr             m2, m6, m4, 14
-  mova  [dstq          ], m2
-  palignr             m2, m6, m4, 12
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m6, m4, 10
-  mova  [dstq+strideq*2], m2
-  palignr             m2, m6, m4, 8
-  mova  [dstq+stride3q ], m2
-  lea               dstq, [dstq+strideq*4]
-  palignr             m2, m6, m4, 6
-  mova  [dstq          ], m2
-  palignr             m2, m6, m4, 4
-  mova  [dstq+strideq  ], m2
-  palignr             m2, m6, m4, 2
-  mova  [dstq+strideq*2], m2
-  mova  [dstq+stride3q ], m4
-  RESTORE_GOT
-  RET
-
-INIT_XMM ssse3
-cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-  mova                  m0, [leftq]
-  movu                  m7, [aboveq-1]
-  movu                  m1, [aboveq+15]
-
-  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
-  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
-
-  palignr               m3, m1, m7, 1
-  palignr               m5, m1, m7, 2
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
-
-  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
-  palignr               m5, m0, m7, 15
-  palignr               m3, m0, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
-  pavgb                 m5, m0                            ; A1 - Ag
-  punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
-  punpckhbw             m4, m5                            ; A-B9 ... A-Bg
-  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
-
-  DEFINE_ARGS dst, stride, stride3, left, line
-  lea             stride3q, [strideq*3]
-
-  palignr               m5, m2, m1, 14
-  palignr               m7, m1, m6, 14
-  mova  [dstq            ], m7
-  mova  [dstq+16         ], m5
-  palignr               m5, m2, m1, 12
-  palignr               m7, m1, m6, 12
-  mova  [dstq+strideq    ], m7
-  mova  [dstq+strideq+16 ], m5
-  palignr                m5, m2, m1, 10
-  palignr                m7, m1, m6, 10
-  mova  [dstq+strideq*2   ], m7
-  mova  [dstq+strideq*2+16], m5
-  palignr                m5, m2, m1, 8
-  palignr                m7, m1, m6, 8
-  mova  [dstq+stride3q    ], m7
-  mova  [dstq+stride3q+16 ], m5
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m5, m2, m1, 6
-  palignr                m7, m1, m6, 6
-  mova  [dstq             ], m7
-  mova  [dstq+16          ], m5
-  palignr                m5, m2, m1, 4
-  palignr                m7, m1, m6, 4
-  mova  [dstq+strideq     ], m7
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m2, m1, 2
-  palignr                m7, m1, m6, 2
-  mova  [dstq+strideq*2   ], m7
-  mova  [dstq+strideq*2+16], m5
-  mova  [dstq+stride3q    ], m6
-  mova  [dstq+stride3q+16 ], m1
-  lea                  dstq, [dstq+strideq*4]
-
-  palignr                m5, m1, m6, 14
-  palignr                m3, m6, m4, 14
-  mova  [dstq             ], m3
-  mova  [dstq+16          ], m5
-  palignr                m5, m1, m6, 12
-  palignr                m3, m6, m4, 12
-  mova  [dstq+strideq     ], m3
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m1, m6, 10
-  palignr                m3, m6, m4, 10
-  mova  [dstq+strideq*2   ], m3
-  mova  [dstq+strideq*2+16], m5
-  palignr                m5, m1, m6, 8
-  palignr                m3, m6, m4, 8
-  mova  [dstq+stride3q    ], m3
-  mova  [dstq+stride3q+16 ], m5
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m5, m1, m6, 6
-  palignr                m3, m6, m4, 6
-  mova  [dstq             ], m3
-  mova  [dstq+16          ], m5
-  palignr                m5, m1, m6, 4
-  palignr                m3, m6, m4, 4
-  mova  [dstq+strideq     ], m3
-  mova  [dstq+strideq+16  ], m5
-  palignr                m5, m1, m6, 2
-  palignr                m3, m6, m4, 2
-  mova  [dstq+strideq*2   ], m3
-  mova  [dstq+strideq*2+16], m5
-  mova  [dstq+stride3q    ], m4
-  mova  [dstq+stride3q+16 ], m6
-  lea               dstq, [dstq+strideq*4]
-
-  mova                   m7, [leftq]
-  mova                   m3, [leftq+16]
-  palignr                m5, m3, m7, 15
-  palignr                m0, m3, m7, 14
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
-  pavgb                  m5, m3                            ; Ah -
-  punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
-  punpckhbw              m2, m5                            ; A-B9 ... A-Bg
-  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
-  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
-
-  palignr                m7, m6, m4, 14
-  palignr                m0, m4, m3, 14
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m6, m4, 12
-  palignr                m0, m4, m3, 12
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m6, m4, 10
-  palignr                m0, m4, m3, 10
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  palignr                m7, m6, m4, 8
-  palignr                m0, m4, m3, 8
-  mova  [dstq+stride3q    ], m0
-  mova  [dstq+stride3q+16 ], m7
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m7, m6, m4, 6
-  palignr                m0, m4, m3, 6
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m6, m4, 4
-  palignr                m0, m4, m3, 4
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m6, m4, 2
-  palignr                m0, m4, m3, 2
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  mova  [dstq+stride3q    ], m3
-  mova  [dstq+stride3q+16 ], m4
-  lea                  dstq, [dstq+strideq*4]
-
-  palignr                m7, m4, m3, 14
-  palignr                m0, m3, m2, 14
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m4, m3, 12
-  palignr                m0, m3, m2, 12
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m4, m3, 10
-  palignr                m0, m3, m2, 10
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  palignr                m7, m4, m3, 8
-  palignr                m0, m3, m2, 8
-  mova  [dstq+stride3q    ], m0
-  mova  [dstq+stride3q+16 ], m7
-  lea                  dstq, [dstq+strideq*4]
-  palignr                m7, m4, m3, 6
-  palignr                m0, m3, m2, 6
-  mova  [dstq             ], m0
-  mova  [dstq+16          ], m7
-  palignr                m7, m4, m3, 4
-  palignr                m0, m3, m2, 4
-  mova  [dstq+strideq     ], m0
-  mova  [dstq+strideq+16  ], m7
-  palignr                m7, m4, m3, 2
-  palignr                m0, m3, m2, 2
-  mova  [dstq+strideq*2   ], m0
-  mova  [dstq+strideq*2+16], m7
-  mova  [dstq+stride3q    ], m2
-  mova  [dstq+stride3q+16 ], m3
-
-  RESTORE_GOT
-  RET
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c
deleted file mode 100644
index a9d6a127c..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c
+++ /dev/null
@@ -1,1238 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/x86/inv_txfm_common_avx2.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  __m256i in[16];
-  load_buffer_16x16(input, in);
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-  mm256_transpose_16x16(in, in);
-  av1_idct16_avx2(in);
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) {
-  const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]);
-  const __m256i v0 = _mm256_unpacklo_epi32(u0, u1);
-  const __m256i v1 = _mm256_unpackhi_epi32(u0, u1);
-  in[0] = _mm256_permute4x64_epi64(v0, 0xA8);
-  in[1] = _mm256_permute4x64_epi64(v0, 0xA9);
-  in[2] = _mm256_permute4x64_epi64(v1, 0xA8);
-  in[3] = _mm256_permute4x64_epi64(v1, 0xA9);
-}
-
-#define MM256_SHUFFLE_EPI64(x0, x1, imm8)                        \
-  _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \
-                                        _mm256_castsi256_pd(x1), imm8))
-
-static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) {
-  int i;
-  for (i = 0; i < 16; i += 4) {
-    transpose_col_to_row_nz4x4(&in[i]);
-  }
-
-  for (i = 0; i < 4; ++i) {
-    in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0);
-    in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0);
-  }
-
-  for (i = 0; i < 4; ++i) {
-    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
-  }
-}
-
-// Coefficients 0-7 before the final butterfly
-static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28);
-  const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04);
-
-  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i v1 = v0;
-
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  __m256i v5, v6;
-  unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6);
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v1, v5);
-  out[3] = _mm256_add_epi16(v0, v4);
-  out[4] = _mm256_sub_epi16(v0, v4);
-  out[5] = _mm256_sub_epi16(v1, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-// Coefficients 8-15 before the final butterfly
-static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
-  const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);
-
-  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-  const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
-  const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);
-
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  __m256i t1, t2, t5, t6;
-  unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  out[0] = _mm256_add_epi16(t0, t3);
-  out[1] = _mm256_add_epi16(t1, t2);
-  out[6] = _mm256_add_epi16(t6, t5);
-  out[7] = _mm256_add_epi16(t7, t4);
-
-  const __m256i v2 = _mm256_sub_epi16(t1, t2);
-  const __m256i v3 = _mm256_sub_epi16(t0, t3);
-  const __m256i v4 = _mm256_sub_epi16(t7, t4);
-  const __m256i v5 = _mm256_sub_epi16(t6, t5);
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
-}
-
-static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm256_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-static INLINE void idct16_10(__m256i *in /*in[16]*/) {
-  __m256i out[16];
-  idct16_10_first_half(in, out);
-  idct16_10_second_half(in, &out[8]);
-  add_sub_butterfly(out, in, 16);
-}
-
-void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[16];
-
-  load_coeff(input, &in[0]);
-  load_coeff(input + 16, &in[1]);
-  load_coeff(input + 32, &in[2]);
-  load_coeff(input + 48, &in[3]);
-
-  transpose_col_to_row_nz4x4(in);
-  idct16_10(in);
-
-  transpose_col_to_row_nz4x16(in);
-  idct16_10(in);
-
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-// Note:
-//  For 16x16 int16_t matrix
-//  transpose first 8 columns into first 8 rows.
-//  Since only upper-left 8x8 are non-zero, the input are first 8 rows (in[8]).
-//  After transposing, the 8 row vectors are in in[8].
-void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) {
-  __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]);
-  __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]);
-  __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]);
-
-  const __m256i v0 = _mm256_unpacklo_epi32(u0, u2);
-  const __m256i v1 = _mm256_unpackhi_epi32(u0, u2);
-  const __m256i v2 = _mm256_unpacklo_epi32(u1, u3);
-  const __m256i v3 = _mm256_unpackhi_epi32(u1, u3);
-
-  u0 = _mm256_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm256_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm256_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm256_unpackhi_epi16(in[6], in[7]);
-
-  const __m256i v4 = _mm256_unpacklo_epi32(u0, u2);
-  const __m256i v5 = _mm256_unpackhi_epi32(u0, u2);
-  const __m256i v6 = _mm256_unpacklo_epi32(u1, u3);
-  const __m256i v7 = _mm256_unpackhi_epi32(u1, u3);
-
-  in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0);
-  in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3);
-  in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0);
-  in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3);
-  in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0);
-  in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3);
-  in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0);
-  in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3);
-}
-
-// Note:
-//  For 16x16 int16_t matrix
-//  transpose first 8 columns into first 8 rows.
-//  Since only matrix left 8x16 are non-zero, the input are total 16 rows
-//  (in[16]).
-//  After transposing, the 8 row vectors are in in[8]. All else are zero.
-static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) {
-  transpose_col_to_row_nz8x8(in);
-  transpose_col_to_row_nz8x8(&in[8]);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20);
-  }
-}
-
-static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28);
-  __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04);
-
-  const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20);
-  __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12);
-
-  const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16);
-  const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24);
-  const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08);
-
-  const __m256i u4 = _mm256_add_epi16(t4, t5);
-  const __m256i u5 = _mm256_sub_epi16(t4, t5);
-  const __m256i u6 = _mm256_sub_epi16(t7, t6);
-  const __m256i u7 = _mm256_add_epi16(t7, t6);
-
-  const __m256i t0 = _mm256_add_epi16(u0, u3);
-  const __m256i t1 = _mm256_add_epi16(u1, u2);
-  const __m256i t2 = _mm256_sub_epi16(u1, u2);
-  const __m256i t3 = _mm256_sub_epi16(u0, u3);
-
-  t4 = u4;
-  t7 = u7;
-
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
-
-  out[0] = _mm256_add_epi16(t0, t7);
-  out[1] = _mm256_add_epi16(t1, t6);
-  out[2] = _mm256_add_epi16(t2, t5);
-  out[3] = _mm256_add_epi16(t3, t4);
-  out[4] = _mm256_sub_epi16(t3, t4);
-  out[5] = _mm256_sub_epi16(t2, t5);
-  out[6] = _mm256_sub_epi16(t1, t6);
-  out[7] = _mm256_sub_epi16(t0, t7);
-}
-
-static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) {
-  const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30);
-  __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02);
-
-  const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-  const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-  __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18);
-  __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14);
-
-  const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-  const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-  __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22);
-  __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10);
-
-  const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-  __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26);
-  __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06);
-
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-  v0 = _mm256_add_epi16(t0, t1);
-  v1 = _mm256_sub_epi16(t0, t1);
-  v2 = _mm256_sub_epi16(t3, t2);
-  v3 = _mm256_add_epi16(t2, t3);
-  v4 = _mm256_add_epi16(t4, t5);
-  v5 = _mm256_sub_epi16(t4, t5);
-  v6 = _mm256_sub_epi16(t7, t6);
-  v7 = _mm256_add_epi16(t6, t7);
-
-  t0 = v0;
-  t7 = v7;
-  t3 = v3;
-  t4 = v4;
-  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
-  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
-  v0 = _mm256_add_epi16(t0, t3);
-  v1 = _mm256_add_epi16(t1, t2);
-  v2 = _mm256_sub_epi16(t1, t2);
-  v3 = _mm256_sub_epi16(t0, t3);
-  v4 = _mm256_sub_epi16(t7, t4);
-  v5 = _mm256_sub_epi16(t6, t5);
-  v6 = _mm256_add_epi16(t6, t5);
-  v7 = _mm256_add_epi16(t7, t4);
-
-  // stage 6, (8-15)
-  out[0] = v0;
-  out[1] = v1;
-  out[6] = v6;
-  out[7] = v7;
-  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
-  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]);
-  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]);
-}
-
-static INLINE void idct16_38(__m256i *in /*in[16]*/) {
-  __m256i out[16];
-  idct16_38_first_half(in, out);
-  idct16_38_second_half(in, &out[8]);
-  add_sub_butterfly(out, in, 16);
-}
-
-void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[16];
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    load_coeff(input + (i << 4), &in[i]);
-  }
-
-  transpose_col_to_row_nz8x8(in);
-  idct16_38(in);
-
-  transpose_col_to_row_nz8x16(in);
-  idct16_38(in);
-
-  store_buffer_16xN(in, stride, dest, 16);
-}
-
-static INLINE int calculate_dc(const tran_low_t *input) {
-  int dc = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  dc = (int)dct_const_round_shift(dc * cospi_16_64);
-  dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS);
-  return dc;
-}
-
-void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const int dc = calculate_dc(input);
-  if (dc == 0) return;
-
-  const __m256i dc_value = _mm256_set1_epi16(dc);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    recon_and_store(&dc_value, dest);
-    dest += stride;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// 32x32 partial IDCT
-
-void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const int dc = calculate_dc(input);
-  if (dc == 0) return;
-
-  const __m256i dc_value = _mm256_set1_epi16(dc);
-
-  int i;
-  for (i = 0; i < 32; ++i) {
-    recon_and_store(&dc_value, dest);
-    recon_and_store(&dc_value, dest + 16);
-    dest += stride;
-  }
-}
-
-static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    load_coeff(input, &in[i]);
-    load_coeff(input + 16, &in[i + 16]);
-    input += 32;
-  }
-}
-
-// Note:
-//  We extend SSSE3 operations to AVX2. Instead of operating on __m128i, we
-// operate coefficients on __m256i. Our operation capacity doubles for each
-// instruction.
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)            \
-  do {                                              \
-    tmp0 = _mm256_madd_epi16(x0, co0);              \
-    tmp1 = _mm256_madd_epi16(x1, co0);              \
-    tmp2 = _mm256_madd_epi16(x0, co1);              \
-    tmp3 = _mm256_madd_epi16(x1, co1);              \
-    tmp0 = _mm256_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm256_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm256_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm256_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m256i *x0, const __m256i *x1,
-                             const __m256i *c0, const __m256i *c1, __m256i *y0,
-                             __m256i *y1) {
-  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm256_unpacklo_epi16(*x0, *x1);
-  u1 = _mm256_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm256_packs_epi32(tmp0, tmp1);
-  *y1 = _mm256_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0,
-                                  const __m256i *c1) {
-  __m256i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm256_unpacklo_epi16(*x0, *x1);
-  u1 = _mm256_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm256_packs_epi32(tmp0, tmp1);
-  *x1 = _mm256_packs_epi32(tmp2, tmp3);
-}
-
-// For each 16x32 block __m256i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m256i in[32]
-static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/,
-                                        __m256i *out /*out[16]*/) {
-  __m256i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m256i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64);
-    const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm256_add_epi16(u8, u9);
-  v9 = _mm256_sub_epi16(u8, u9);
-  v14 = _mm256_sub_epi16(u15, u14);
-  v15 = _mm256_add_epi16(u15, u14);
-
-  {
-    const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64);
-    const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm256_sub_epi16(u11, u10);
-  v11 = _mm256_add_epi16(u11, u10);
-  v12 = _mm256_add_epi16(u12, u13);
-  v13 = _mm256_sub_epi16(u12, u13);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(v8, v11);
-  out[1] = _mm256_add_epi16(v9, v10);
-  out[6] = _mm256_add_epi16(v14, v13);
-  out[7] = _mm256_add_epi16(v15, v12);
-
-  out[2] = _mm256_sub_epi16(v9, v10);
-  out[3] = _mm256_sub_epi16(v8, v11);
-  out[4] = _mm256_sub_epi16(v15, v12);
-  out[5] = _mm256_sub_epi16(v14, v13);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m256i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m256i in[32]
-static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/,
-                                        __m256i *out /*out[8]*/) {
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64);
-    const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm256_add_epi16(u4, u5);
-  v5 = _mm256_sub_epi16(u4, u5);
-  v6 = _mm256_sub_epi16(u7, u6);
-  v7 = _mm256_add_epi16(u7, u6);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm256_add_epi16(u0, u3);
-  v1 = _mm256_add_epi16(u1, u2);
-  v2 = _mm256_sub_epi16(u1, u2);
-  v3 = _mm256_sub_epi16(u0, u3);
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v2, v5);
-  out[3] = _mm256_add_epi16(v3, v4);
-  out[4] = _mm256_sub_epi16(v3, v4);
-  out[5] = _mm256_sub_epi16(v2, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m256i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m256i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/,
-                                          __m256i *out /*out[16]*/) {
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64);
-    const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64);
-    const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64);
-    const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64);
-    const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64);
-    const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
-    const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64);
-    const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm256_add_epi16(u16, u17);
-  v17 = _mm256_sub_epi16(u16, u17);
-  v18 = _mm256_sub_epi16(u19, u18);
-  v19 = _mm256_add_epi16(u19, u18);
-
-  v20 = _mm256_add_epi16(u20, u21);
-  v21 = _mm256_sub_epi16(u20, u21);
-  v22 = _mm256_sub_epi16(u23, u22);
-  v23 = _mm256_add_epi16(u23, u22);
-
-  v24 = _mm256_add_epi16(u24, u25);
-  v25 = _mm256_sub_epi16(u24, u25);
-  v26 = _mm256_sub_epi16(u27, u26);
-  v27 = _mm256_add_epi16(u27, u26);
-
-  v28 = _mm256_add_epi16(u28, u29);
-  v29 = _mm256_sub_epi16(u28, u29);
-  v30 = _mm256_sub_epi16(u31, u30);
-  v31 = _mm256_add_epi16(u31, u30);
-
-  {
-    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-
-  u24 = _mm256_add_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u27 = _mm256_sub_epi16(v24, v27);
-
-  u28 = _mm256_sub_epi16(v31, v28);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-  u31 = _mm256_add_epi16(v28, v31);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(u16, u23);
-  out[1] = _mm256_add_epi16(u17, u22);
-  out[2] = _mm256_add_epi16(u18, u21);
-  out[3] = _mm256_add_epi16(u19, u20);
-  out[4] = _mm256_sub_epi16(u19, u20);
-  out[5] = _mm256_sub_epi16(u18, u21);
-  out[6] = _mm256_sub_epi16(u17, u22);
-  out[7] = _mm256_sub_epi16(u16, u23);
-
-  out[8] = _mm256_sub_epi16(u31, u24);
-  out[9] = _mm256_sub_epi16(u30, u25);
-  out[10] = _mm256_sub_epi16(u29, u26);
-  out[11] = _mm256_sub_epi16(u28, u27);
-  out[12] = _mm256_add_epi16(u27, u28);
-  out[13] = _mm256_add_epi16(u26, u29);
-  out[14] = _mm256_add_epi16(u25, u30);
-  out[15] = _mm256_add_epi16(u24, u31);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/,
-                                          __m256i *out /*out[32]*/) {
-  __m256i temp[16];
-  idct32_full_16x32_quarter_1(in, temp);
-  idct32_full_16x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_16x32(const __m256i *in /*in[32]*/,
-                         __m256i *out /*out[32]*/) {
-  __m256i temp[32];
-  idct32_full_16x32_quarter_1_2(in, temp);
-  idct32_full_16x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  __m256i col[64], in[32];
-  int i;
-
-  for (i = 0; i < 2; ++i) {
-    load_buffer_32x16(input, in);
-    input += 32 << 4;
-
-    mm256_transpose_16x16(in, in);
-    mm256_transpose_16x16(&in[16], &in[16]);
-    idct32_16x32(in, col + (i << 5));
-  }
-
-  for (i = 0; i < 2; ++i) {
-    int j = i << 4;
-    mm256_transpose_16x16(col + j, in);
-    mm256_transpose_16x16(col + j + 32, &in[16]);
-    idct32_16x32(in, in);
-    store_buffer_16xN(in, stride, dest, 32);
-    dest += 16;
-  }
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/,
-                                       __m256i *out /*out[8]*/) {
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  {
-    const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-    const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-    const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-    u0 = _mm256_mulhrs_epi16(in[0], stk4_0);
-    u2 = _mm256_mulhrs_epi16(in[8], stk4_2);
-    u3 = _mm256_mulhrs_epi16(in[8], stk4_3);
-    u1 = u0;
-  }
-
-  v0 = _mm256_add_epi16(u0, u3);
-  v1 = _mm256_add_epi16(u1, u2);
-  v2 = _mm256_sub_epi16(u1, u2);
-  v3 = _mm256_sub_epi16(u0, u3);
-
-  {
-    const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-    const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-    const __m256i stk3_2 =
-        pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-    const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-    u4 = _mm256_mulhrs_epi16(in[4], stk3_0);
-    u7 = _mm256_mulhrs_epi16(in[4], stk3_1);
-    u5 = _mm256_mulhrs_epi16(in[12], stk3_2);
-    u6 = _mm256_mulhrs_epi16(in[12], stk3_3);
-  }
-
-  v4 = _mm256_add_epi16(u4, u5);
-  v5 = _mm256_sub_epi16(u4, u5);
-  v6 = _mm256_sub_epi16(u7, u6);
-  v7 = _mm256_add_epi16(u7, u6);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-  }
-
-  out[0] = _mm256_add_epi16(v0, v7);
-  out[1] = _mm256_add_epi16(v1, v6);
-  out[2] = _mm256_add_epi16(v2, v5);
-  out[3] = _mm256_add_epi16(v3, v4);
-  out[4] = _mm256_sub_epi16(v3, v4);
-  out[5] = _mm256_sub_epi16(v2, v5);
-  out[6] = _mm256_sub_epi16(v1, v6);
-  out[7] = _mm256_sub_epi16(v0, v7);
-}
-
-static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/,
-                                       __m256i *out /*out[8]*/) {
-  __m256i u8, u9, u10, u11, u12, u13, u14, u15;
-  __m256i v8, v9, v10, v11, v12, v13, v14, v15;
-
-  {
-    const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-    const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-    const __m256i stk2_2 =
-        pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-    const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-    const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-    const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-    const __m256i stk2_6 =
-        pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-    const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-    u8 = _mm256_mulhrs_epi16(in[2], stk2_0);
-    u15 = _mm256_mulhrs_epi16(in[2], stk2_1);
-    u9 = _mm256_mulhrs_epi16(in[14], stk2_2);
-    u14 = _mm256_mulhrs_epi16(in[14], stk2_3);
-    u10 = _mm256_mulhrs_epi16(in[10], stk2_4);
-    u13 = _mm256_mulhrs_epi16(in[10], stk2_5);
-    u11 = _mm256_mulhrs_epi16(in[6], stk2_6);
-    u12 = _mm256_mulhrs_epi16(in[6], stk2_7);
-  }
-
-  v8 = _mm256_add_epi16(u8, u9);
-  v9 = _mm256_sub_epi16(u8, u9);
-  v10 = _mm256_sub_epi16(u11, u10);
-  v11 = _mm256_add_epi16(u11, u10);
-  v12 = _mm256_add_epi16(u12, u13);
-  v13 = _mm256_sub_epi16(u12, u13);
-  v14 = _mm256_sub_epi16(u15, u14);
-  v15 = _mm256_add_epi16(u15, u14);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(v8, v11);
-  out[1] = _mm256_add_epi16(v9, v10);
-  out[2] = _mm256_sub_epi16(v9, v10);
-  out[3] = _mm256_sub_epi16(v8, v11);
-  out[4] = _mm256_sub_epi16(v15, v12);
-  out[5] = _mm256_sub_epi16(v14, v13);
-  out[6] = _mm256_add_epi16(v14, v13);
-  out[7] = _mm256_add_epi16(v15, v12);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/,
-                                     __m256i *out /*out[32]*/) {
-  __m256i temp[16];
-  idct32_16x32_135_quarter_1(in, temp);
-  idct32_16x32_135_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/,
-                                     __m256i *out /*out[32]*/) {
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-    const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-    const __m256i stk1_2 =
-        pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
-    const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
-    const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
-    const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
-    const __m256i stk1_6 =
-        pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-    const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-    const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-    const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-    const __m256i stk1_10 =
-        pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
-    const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
-    const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
-    const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
-    const __m256i stk1_14 =
-        pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-    const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-    u16 = _mm256_mulhrs_epi16(in[1], stk1_0);
-    u31 = _mm256_mulhrs_epi16(in[1], stk1_1);
-    u17 = _mm256_mulhrs_epi16(in[15], stk1_2);
-    u30 = _mm256_mulhrs_epi16(in[15], stk1_3);
-
-    u18 = _mm256_mulhrs_epi16(in[9], stk1_4);
-    u29 = _mm256_mulhrs_epi16(in[9], stk1_5);
-    u19 = _mm256_mulhrs_epi16(in[7], stk1_6);
-    u28 = _mm256_mulhrs_epi16(in[7], stk1_7);
-
-    u20 = _mm256_mulhrs_epi16(in[5], stk1_8);
-    u27 = _mm256_mulhrs_epi16(in[5], stk1_9);
-    u21 = _mm256_mulhrs_epi16(in[11], stk1_10);
-    u26 = _mm256_mulhrs_epi16(in[11], stk1_11);
-
-    u22 = _mm256_mulhrs_epi16(in[13], stk1_12);
-    u25 = _mm256_mulhrs_epi16(in[13], stk1_13);
-    u23 = _mm256_mulhrs_epi16(in[3], stk1_14);
-    u24 = _mm256_mulhrs_epi16(in[3], stk1_15);
-  }
-
-  v16 = _mm256_add_epi16(u16, u17);
-  v17 = _mm256_sub_epi16(u16, u17);
-  v18 = _mm256_sub_epi16(u19, u18);
-  v19 = _mm256_add_epi16(u19, u18);
-
-  v20 = _mm256_add_epi16(u20, u21);
-  v21 = _mm256_sub_epi16(u20, u21);
-  v22 = _mm256_sub_epi16(u23, u22);
-  v23 = _mm256_add_epi16(u23, u22);
-
-  v24 = _mm256_add_epi16(u24, u25);
-  v25 = _mm256_sub_epi16(u24, u25);
-  v26 = _mm256_sub_epi16(u27, u26);
-  v27 = _mm256_add_epi16(u27, u26);
-
-  v28 = _mm256_add_epi16(u28, u29);
-  v29 = _mm256_sub_epi16(u28, u29);
-  v30 = _mm256_sub_epi16(u31, u30);
-  v31 = _mm256_add_epi16(u31, u30);
-
-  {
-    const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-    const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-    const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-
-  u24 = _mm256_add_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u27 = _mm256_sub_epi16(v24, v27);
-  u28 = _mm256_sub_epi16(v31, v28);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-  u31 = _mm256_add_epi16(v28, v31);
-
-  {
-    const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-    const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm256_add_epi16(u16, u23);
-  out[1] = _mm256_add_epi16(u17, u22);
-  out[2] = _mm256_add_epi16(u18, u21);
-  out[3] = _mm256_add_epi16(u19, u20);
-  v20 = _mm256_sub_epi16(u19, u20);
-  v21 = _mm256_sub_epi16(u18, u21);
-  v22 = _mm256_sub_epi16(u17, u22);
-  v23 = _mm256_sub_epi16(u16, u23);
-
-  v24 = _mm256_sub_epi16(u31, u24);
-  v25 = _mm256_sub_epi16(u30, u25);
-  v26 = _mm256_sub_epi16(u29, u26);
-  v27 = _mm256_sub_epi16(u28, u27);
-  out[12] = _mm256_add_epi16(u27, u28);
-  out[13] = _mm256_add_epi16(u26, u29);
-  out[14] = _mm256_add_epi16(u25, u30);
-  out[15] = _mm256_add_epi16(u24, u31);
-
-  {
-    const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-    const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
-    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
-    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
-    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
-  }
-}
-
-// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
-static void idct32_16x32_135(__m256i *in /*in[32]*/) {
-  __m256i out[32];
-  idct32_16x32_quarter_1_2(in, out);
-  idct32_16x32_quarter_3_4(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in,
-                                          int size) {
-  int i = 0;
-  while (i < size) {
-    load_coeff(coeff + (i << 5), &in[i]);
-    i += 1;
-  }
-}
-
-static INLINE void zero_buffer(__m256i *in, int num) {
-  int i;
-  for (i = 0; i < num; ++i) {
-    in[i] = _mm256_setzero_si256();
-  }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  __m256i in[32];
-  zero_buffer(in, 32);
-  load_buffer_from_32x32(input, in, 16);
-  mm256_transpose_16x16(in, in);
-  idct32_16x32_135(in);
-
-  __m256i out[32];
-  mm256_transpose_16x16(in, out);
-  idct32_16x32_135(out);
-  store_buffer_16xN(out, stride, dest, 32);
-  mm256_transpose_16x16(&in[16], in);
-  idct32_16x32_135(in);
-  store_buffer_16xN(in, stride, dest + 16, 32);
-}
-
-static void idct32_34_first_half(const __m256i *in, __m256i *stp1) {
-  const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
-  const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
-  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m256i x0, x1, x4, x5, x6, x7;
-  __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
-  // phase 1
-
-  // 0, 15
-  u2 = _mm256_mulhrs_epi16(in[2], stk2_1);  // stp2_15
-  u3 = _mm256_mulhrs_epi16(in[6], stk2_7);  // stp2_12
-  v15 = _mm256_add_epi16(u2, u3);
-  // in[0], in[4]
-  x0 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
-  x7 = _mm256_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
-  v0 = _mm256_add_epi16(x0, x7);            // stp2_0
-  stp1[0] = _mm256_add_epi16(v0, v15);
-  stp1[15] = _mm256_sub_epi16(v0, v15);
-
-  // in[2], in[6]
-  u0 = _mm256_mulhrs_epi16(in[2], stk2_0);          // stp2_8
-  u1 = _mm256_mulhrs_epi16(in[6], stk2_6);          // stp2_11
-  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
-  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13
-
-  v8 = _mm256_add_epi16(u0, u1);
-  v9 = _mm256_add_epi16(u4, u6);
-  v10 = _mm256_sub_epi16(u4, u6);
-  v11 = _mm256_sub_epi16(u0, u1);
-  v12 = _mm256_sub_epi16(u2, u3);
-  v13 = _mm256_sub_epi16(u5, u7);
-  v14 = _mm256_add_epi16(u5, u7);
-
-  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
-  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
-  // 1, 14
-  x1 = _mm256_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
-  // stp1[2] = stp1[0], stp1[3] = stp1[1]
-  x4 = _mm256_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
-  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
-  v1 = _mm256_add_epi16(x1, x6);  // stp2_1
-  v2 = _mm256_add_epi16(x0, x5);  // stp2_2
-  stp1[1] = _mm256_add_epi16(v1, v14);
-  stp1[14] = _mm256_sub_epi16(v1, v14);
-
-  stp1[2] = _mm256_add_epi16(v2, v13);
-  stp1[13] = _mm256_sub_epi16(v2, v13);
-
-  v3 = _mm256_add_epi16(x1, x4);  // stp2_3
-  v4 = _mm256_sub_epi16(x1, x4);  // stp2_4
-
-  v5 = _mm256_sub_epi16(x0, x5);  // stp2_5
-
-  v6 = _mm256_sub_epi16(x1, x6);  // stp2_6
-  v7 = _mm256_sub_epi16(x0, x7);  // stp2_7
-  stp1[3] = _mm256_add_epi16(v3, v12);
-  stp1[12] = _mm256_sub_epi16(v3, v12);
-
-  stp1[6] = _mm256_add_epi16(v6, v9);
-  stp1[9] = _mm256_sub_epi16(v6, v9);
-
-  stp1[7] = _mm256_add_epi16(v7, v8);
-  stp1[8] = _mm256_sub_epi16(v7, v8);
-
-  stp1[4] = _mm256_add_epi16(v4, v11);
-  stp1[11] = _mm256_sub_epi16(v4, v11);
-
-  stp1[5] = _mm256_add_epi16(v5, v10);
-  stp1[10] = _mm256_sub_epi16(v5, v10);
-}
-
-static void idct32_34_second_half(const __m256i *in, __m256i *stp1) {
-  const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-  const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-  const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-  const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-  const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-  const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-  const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-  const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-  const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64);
-  const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64);
-  const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64);
-  const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64);
-  const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
-  __m256i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m256i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m256i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m256i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  v16 = _mm256_mulhrs_epi16(in[1], stk1_0);
-  v31 = _mm256_mulhrs_epi16(in[1], stk1_1);
-
-  v19 = _mm256_mulhrs_epi16(in[7], stk1_6);
-  v28 = _mm256_mulhrs_epi16(in[7], stk1_7);
-
-  v20 = _mm256_mulhrs_epi16(in[5], stk1_8);
-  v27 = _mm256_mulhrs_epi16(in[5], stk1_9);
-
-  v23 = _mm256_mulhrs_epi16(in[3], stk1_14);
-  v24 = _mm256_mulhrs_epi16(in[3], stk1_15);
-
-  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
-  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
-  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
-  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
-  u16 = _mm256_add_epi16(v16, v19);
-  u17 = _mm256_add_epi16(v17, v18);
-  u18 = _mm256_sub_epi16(v17, v18);
-  u19 = _mm256_sub_epi16(v16, v19);
-  u20 = _mm256_sub_epi16(v23, v20);
-  u21 = _mm256_sub_epi16(v22, v21);
-  u22 = _mm256_add_epi16(v22, v21);
-  u23 = _mm256_add_epi16(v23, v20);
-  u24 = _mm256_add_epi16(v24, v27);
-  u27 = _mm256_sub_epi16(v24, v27);
-  u25 = _mm256_add_epi16(v25, v26);
-  u26 = _mm256_sub_epi16(v25, v26);
-  u28 = _mm256_sub_epi16(v31, v28);
-  u31 = _mm256_add_epi16(v28, v31);
-  u29 = _mm256_sub_epi16(v30, v29);
-  u30 = _mm256_add_epi16(v29, v30);
-
-  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
-  stp1[0] = _mm256_add_epi16(u16, u23);
-  stp1[7] = _mm256_sub_epi16(u16, u23);
-
-  stp1[1] = _mm256_add_epi16(u17, u22);
-  stp1[6] = _mm256_sub_epi16(u17, u22);
-
-  stp1[2] = _mm256_add_epi16(u18, u21);
-  stp1[5] = _mm256_sub_epi16(u18, u21);
-
-  stp1[3] = _mm256_add_epi16(u19, u20);
-  stp1[4] = _mm256_sub_epi16(u19, u20);
-
-  stp1[8] = _mm256_sub_epi16(u31, u24);
-  stp1[15] = _mm256_add_epi16(u24, u31);
-
-  stp1[9] = _mm256_sub_epi16(u30, u25);
-  stp1[14] = _mm256_add_epi16(u25, u30);
-
-  stp1[10] = _mm256_sub_epi16(u29, u26);
-  stp1[13] = _mm256_add_epi16(u26, u29);
-
-  stp1[11] = _mm256_sub_epi16(u28, u27);
-  stp1[12] = _mm256_add_epi16(u27, u28);
-
-  butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0);
-}
-
-// 16x16 block input __m256i in[32], output 16x32 __m256i in[32]
-static void idct32_16x32_34(__m256i *in /*in[32]*/) {
-  __m256i out[32];
-  idct32_34_first_half(in, out);
-  idct32_34_second_half(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  __m256i in[32];
-  zero_buffer(in, 32);
-  load_buffer_from_32x32(input, in, 8);
-  mm256_transpose_16x16(in, in);
-  idct32_16x32_34(in);
-
-  __m256i out[32];
-  mm256_transpose_16x16(in, out);
-  idct32_16x32_34(out);
-  store_buffer_16xN(out, stride, dest, 32);
-  mm256_transpose_16x16(&in[16], in);
-  idct32_16x32_34(in);
-  store_buffer_16xN(in, stride, dest + 16, 32);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
deleted file mode 100644
index 26c5cfe59..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
-#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
-
-#include <immintrin.h>
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_avx2.h"
-
-static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-  if (sizeof(tran_low_t) == 4) {
-    *in = _mm256_setr_epi16(
-        (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
-        (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
-        (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
-        (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
-        (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
-        (int16_t)coeff[15]);
-  } else {
-    *in = _mm256_loadu_si256((const __m256i *)coeff);
-  }
-}
-
-static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
-  int i = 0;
-  while (i < 16) {
-    load_coeff(coeff + (i << 4), &in[i]);
-    i += 1;
-  }
-}
-
-static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i x = _mm_loadu_si128((__m128i const *)output);
-  __m128i p0 = _mm_unpacklo_epi8(x, zero);
-  __m128i p1 = _mm_unpackhi_epi8(x, zero);
-
-  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
-  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
-  x = _mm_packus_epi16(p0, p1);
-  _mm_storeu_si128((__m128i *)output, x);
-}
-
-#define IDCT_ROUNDING_POS (6)
-static INLINE void store_buffer_16xN(__m256i *in, const int stride,
-                                     uint8_t *output, int num) {
-  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
-  int i = 0;
-
-  while (i < num) {
-    in[i] = _mm256_adds_epi16(in[i], rounding);
-    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
-    recon_and_store(&in[i], output + i * stride);
-    i += 1;
-  }
-}
-
-static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
-                                     const __m256i *c0, const __m256i *c1,
-                                     __m256i *b0, __m256i *b1) {
-  __m256i x0, x1;
-  x0 = _mm256_unpacklo_epi16(*a0, *a1);
-  x1 = _mm256_unpackhi_epi16(*a0, *a1);
-  *b0 = butter_fly(&x0, &x1, c0);
-  *b1 = butter_fly(&x0, &x1, c1);
-}
-
-void av1_idct16_avx2(__m256i *in);
-
-#endif  // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
deleted file mode 100644
index 86ce928b7..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c
+++ /dev/null
@@ -1,3500 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x)                    \
-  {                                                       \
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                     \
-    d0 = _mm_add_epi16(in_x, d0);                         \
-    d0 = _mm_packus_epi16(d0, d0);                        \
-    *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
-  }
-
-void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = load_input_data(input);
-  input2 = load_input_data(input + 8);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-void aom_idct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-
-  array_transpose_4x4(in);
-  // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
-
-  // stage 2
-  in[0] = _mm_add_epi16(u[0], u[1]);
-  in[1] = _mm_sub_epi16(u[0], u[1]);
-  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-void aom_iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
-
-  array_transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
-                               res0, res1, res2, res3)                         \
-  {                                                                            \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
-    tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
-    tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
-    tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
-    tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
-                                                                               \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
-    res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
-    res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
-  }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {                                                                  \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
-                                                                     \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
-                                                                     \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
-                                                                     \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
-              out4, out5, out6, out7)                                         \
-  {                                                                           \
-    /* Stage1 */                                                              \
-    {                                                                         \
-      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
-      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
-      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
-      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
-                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
-    }                                                                         \
-                                                                              \
-    /* Stage2 */                                                              \
-    {                                                                         \
-      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
-      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
-      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
-      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
-                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
-                                                                              \
-      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                                \
-      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                                \
-      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                                \
-      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                                \
-    }                                                                         \
-                                                                              \
-    /* Stage3 */                                                              \
-    {                                                                         \
-      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
-      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
-                                                                              \
-      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                                \
-      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                                \
-      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                                \
-      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                                \
-                                                                              \
-      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
-      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
-      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
-      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
-                                                                              \
-      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
-      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
-      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
-      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
-                                                                              \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
-                                                                              \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
-    }                                                                         \
-                                                                              \
-    /* Stage4  */                                                             \
-    out0 = _mm_adds_epi16(stp1_0, stp2_7);                                    \
-    out1 = _mm_adds_epi16(stp1_1, stp1_6);                                    \
-    out2 = _mm_adds_epi16(stp1_2, stp1_5);                                    \
-    out3 = _mm_adds_epi16(stp1_3, stp2_4);                                    \
-    out4 = _mm_subs_epi16(stp1_3, stp2_4);                                    \
-    out5 = _mm_subs_epi16(stp1_2, stp1_5);                                    \
-    out6 = _mm_subs_epi16(stp1_1, stp1_6);                                    \
-    out7 = _mm_subs_epi16(stp1_0, stp2_7);                                    \
-  }
-
-void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from aom_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
-          in6, in7);
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest + 0 * stride, dc_value);
-  RECON_AND_STORE(dest + 1 * stride, dc_value);
-  RECON_AND_STORE(dest + 2 * stride, dc_value);
-  RECON_AND_STORE(dest + 3 * stride, dc_value);
-  RECON_AND_STORE(dest + 4 * stride, dc_value);
-  RECON_AND_STORE(dest + 5 * stride, dc_value);
-  RECON_AND_STORE(dest + 6 * stride, dc_value);
-  RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void aom_idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // 8x8 Transpose is copied from aom_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
-                in1, in2, in3, in4, in5, in6, in7);
-
-  // 4-stage 1D idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
-        in[4], in[5], in[6], in[7]);
-}
-
-void aom_iadst8_sse2(__m128i *in) {
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
-        in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16                                                                 \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
-                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
-                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
-                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
-                                                                               \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-                                                                               \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-#define IDCT16_10                                                              \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
-                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
-                           stp1_12_0)                                          \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
-                                                                               \
-    stp1_9 = stp1_8_0;                                                         \
-    stp1_10 = stp1_11;                                                         \
-                                                                               \
-    stp1_13 = stp1_12_0;                                                       \
-    stp1_14 = stp1_15;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
-    stp2_5 = stp2_4;                                                           \
-    stp2_6 = stp2_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_2 = stp1_1;                                                           \
-    stp1_3 = stp1_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  curr1 = l;
-  for (i = 0; i < 2; i++) {
-    // 1-D idct
-
-    // Load input data.
-    in[0] = load_input_data(input);
-    in[8] = load_input_data(input + 8 * 1);
-    in[1] = load_input_data(input + 8 * 2);
-    in[9] = load_input_data(input + 8 * 3);
-    in[2] = load_input_data(input + 8 * 4);
-    in[10] = load_input_data(input + 8 * 5);
-    in[3] = load_input_data(input + 8 * 6);
-    in[11] = load_input_data(input + 8 * 7);
-    in[4] = load_input_data(input + 8 * 8);
-    in[12] = load_input_data(input + 8 * 9);
-    in[5] = load_input_data(input + 8 * 10);
-    in[13] = load_input_data(input + 8 * 11);
-    in[6] = load_input_data(input + 8 * 12);
-    in[14] = load_input_data(input + 8 * 13);
-    in[7] = load_input_data(input + 8 * 14);
-    in[15] = load_input_data(input + 8 * 15);
-
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
-    input += 128;
-  }
-  for (i = 0; i < 2; i++) {
-    int j;
-    // 1-D idct
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // 2-D
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 16; ++i) {
-    RECON_AND_STORE(dest + 0, dc_value);
-    RECON_AND_STORE(dest + 8, dc_value);
-    dest += stride;
-  }
-}
-
-void iadst16_8col(__m128i *in) {
-  // perform 16x16 1-D ADST for 8 columns
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
-}
-
-void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
-      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
-      stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-#define LOAD_DQCOEFF(reg, input)  \
-  {                               \
-    reg = load_input_data(input); \
-    input += 8;                   \
-  }
-
-#define IDCT32_34                                                              \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
-                                                                               \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
-                                                                               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
-                             stp1_31);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
-                             stp1_28);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
-                             stp1_27);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
-                             stp1_24);                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
-                             stp2_15);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
-                             stp2_12);                                         \
-                                                                               \
-    stp2_16 = stp1_16;                                                         \
-    stp2_19 = stp1_19;                                                         \
-                                                                               \
-    stp2_20 = stp1_20;                                                         \
-    stp2_23 = stp1_23;                                                         \
-                                                                               \
-    stp2_24 = stp1_24;                                                         \
-    stp2_27 = stp1_27;                                                         \
-                                                                               \
-    stp2_28 = stp1_28;                                                         \
-    stp2_31 = stp1_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
-                             stp1_7);                                          \
-                                                                               \
-    stp1_8 = stp2_8;                                                           \
-    stp1_11 = stp2_11;                                                         \
-    stp1_12 = stp2_12;                                                         \
-    stp1_15 = stp2_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
-                             stp2_1);                                          \
-                                                                               \
-    stp2_4 = stp1_4;                                                           \
-    stp2_5 = stp1_4;                                                           \
-    stp2_6 = stp1_7;                                                           \
-    stp2_7 = stp1_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = stp2_0;                                                           \
-    stp1_1 = stp2_1;                                                           \
-    stp1_2 = stp2_1;                                                           \
-    stp1_3 = stp2_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
-#define IDCT32(in0, in1)                                                       \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]);           \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]);           \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]);          \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]);          \
-                                                                               \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]);            \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]);            \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]);            \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]);            \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]);           \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]);           \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]);          \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]);          \
-                                                                               \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]);          \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]);          \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]);           \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]);           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
-                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
-                           stp1_30)                                            \
-    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
-                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
-                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
-                           stp1_21, stp1_26)                                   \
-    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
-                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
-                           stp1_23, stp1_24)                                   \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]);           \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]);           \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]);          \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]);          \
-                                                                               \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]);          \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]);          \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]);           \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]);           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
-                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
-                           stp2_14)                                            \
-    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
-                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
-    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
-                                                                               \
-    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
-    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
-                                                                               \
-    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
-    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]);           \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]);           \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]);          \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]);          \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
-                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
-                           stp1_6)                                             \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]);            \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]);            \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]);            \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]);            \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
-
-  for (i = 8; i < 32; ++i) {
-    in[i] = _mm_setzero_si128();
-  }
-
-  array_transpose_8x8(in, in);
-  // TODO(hkuang): Following transposes are unnecessary. But remove them will
-  // lead to performance drop on some devices.
-  array_transpose_8x8(in + 8, in + 8);
-  array_transpose_8x8(in + 16, in + 16);
-  array_transpose_8x8(in + 24, in + 24);
-
-  IDCT32_34
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    IDCT32_34
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D idct
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // checking if all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    IDCT32(in, in + 16)
-
-    // 1_D: Store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-  }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D idct
-    j = i << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    IDCT32(in, in + 16)
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, j;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  if (a == 0) return;
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (j = 0; j < 32; ++j) {
-    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
-  }
-}
-
-// Apply a 32-element IDCT to 8 columns. This does not do any transposition
-// of its input - the caller is expected to have done that.
-// The input buffers are the top and bottom halves of an 8x32 block.
-void idct32_8col(__m128i *in0, __m128i *in1) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  IDCT32(in0, in1)
-
-  // 2_D: Calculate the results and store them to destination.
-  in0[0] = _mm_add_epi16(stp1_0, stp1_31);
-  in0[1] = _mm_add_epi16(stp1_1, stp1_30);
-  in0[2] = _mm_add_epi16(stp1_2, stp1_29);
-  in0[3] = _mm_add_epi16(stp1_3, stp1_28);
-  in0[4] = _mm_add_epi16(stp1_4, stp1_27);
-  in0[5] = _mm_add_epi16(stp1_5, stp1_26);
-  in0[6] = _mm_add_epi16(stp1_6, stp1_25);
-  in0[7] = _mm_add_epi16(stp1_7, stp1_24);
-  in0[8] = _mm_add_epi16(stp1_8, stp1_23);
-  in0[9] = _mm_add_epi16(stp1_9, stp1_22);
-  in0[10] = _mm_add_epi16(stp1_10, stp1_21);
-  in0[11] = _mm_add_epi16(stp1_11, stp1_20);
-  in0[12] = _mm_add_epi16(stp1_12, stp1_19);
-  in0[13] = _mm_add_epi16(stp1_13, stp1_18);
-  in0[14] = _mm_add_epi16(stp1_14, stp1_17);
-  in0[15] = _mm_add_epi16(stp1_15, stp1_16);
-  in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
-  in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
-  in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
-  in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
-  in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
-  in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
-  in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
-  in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
-  in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
-  in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
-  in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
-  in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
-  in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
-  in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
-  in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
-  in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
deleted file mode 100644
index 342816977..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
-  }
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
-// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
-  if (sizeof(tran_low_t) == 4) {
-    return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
-                          data[6], data[7]);
-  } else {
-    return _mm_load_si128((const __m128i *)data);
-  }
-}
-
-static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
-
-  in[8] = load_input_data(input + 8 * 16);
-  in[9] = load_input_data(input + 9 * 16);
-  in[10] = load_input_data(input + 10 * 16);
-  in[11] = load_input_data(input + 11 * 16);
-  in[12] = load_input_data(input + 12 * 16);
-  in[13] = load_input_data(input + 13 * 16);
-  in[14] = load_input_data(input + 14 * 16);
-  in[15] = load_input_data(input + 15 * 16);
-}
-
-#define RECON_AND_STORE(dest, in_x)                  \
-  {                                                  \
-    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                \
-    d0 = _mm_add_epi16(in_x, d0);                    \
-    d0 = _mm_packus_epi16(d0, d0);                   \
-    _mm_storel_epi64((__m128i *)(dest), d0);         \
-  }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-  in[8] = _mm_adds_epi16(in[8], final_rounding);
-  in[9] = _mm_adds_epi16(in[9], final_rounding);
-  in[10] = _mm_adds_epi16(in[10], final_rounding);
-  in[11] = _mm_adds_epi16(in[11], final_rounding);
-  in[12] = _mm_adds_epi16(in[12], final_rounding);
-  in[13] = _mm_adds_epi16(in[13], final_rounding);
-  in[14] = _mm_adds_epi16(in[14], final_rounding);
-  in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-  in[8] = _mm_srai_epi16(in[8], 6);
-  in[9] = _mm_srai_epi16(in[9], 6);
-  in[10] = _mm_srai_epi16(in[10], 6);
-  in[11] = _mm_srai_epi16(in[11], 6);
-  in[12] = _mm_srai_epi16(in[12], 6);
-  in[13] = _mm_srai_epi16(in[13], 6);
-  in[14] = _mm_srai_epi16(in[14], 6);
-  in[15] = _mm_srai_epi16(in[15], 6);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-  RECON_AND_STORE(dest + 8 * stride, in[8]);
-  RECON_AND_STORE(dest + 9 * stride, in[9]);
-  RECON_AND_STORE(dest + 10 * stride, in[10]);
-  RECON_AND_STORE(dest + 11 * stride, in[11]);
-  RECON_AND_STORE(dest + 12 * stride, in[12]);
-  RECON_AND_STORE(dest + 13 * stride, in[13]);
-  RECON_AND_STORE(dest + 14 * stride, in[14]);
-  RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
-  {                                                                      \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0);                \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3);                \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2);                \
-                                                                         \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);              \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);              \
-                                                                         \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                             \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                             \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                             \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                             \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                                      \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
-  }
-
-void iadst16_8col(__m128i *in);
-void idct16_8col(__m128i *in);
-void aom_idct4_sse2(__m128i *in);
-void aom_idct8_sse2(__m128i *in);
-void aom_idct16_sse2(__m128i *in0, __m128i *in1);
-void aom_iadst4_sse2(__m128i *in);
-void aom_iadst8_sse2(__m128i *in);
-void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
-void idct32_8col(__m128i *in0, __m128i *in1);
-
-#endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
deleted file mode 100644
index 9d006797b..000000000
--- a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c
+++ /dev/null
@@ -1,1333 +0,0 @@
-/*
- *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    {
-      /* Stage1 */
-      {
-        const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
-        const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
-        const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
-        const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
-
-        {
-          tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-          tmp1 = _mm_madd_epi16(hi_17, stg1_0);
-          tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-          tmp3 = _mm_madd_epi16(hi_17, stg1_1);
-          tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-          tmp5 = _mm_madd_epi16(hi_35, stg1_2);
-          tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-          tmp7 = _mm_madd_epi16(hi_35, stg1_3);
-
-          tmp0 = _mm_add_epi32(tmp0, rounding);
-          tmp1 = _mm_add_epi32(tmp1, rounding);
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-          tmp4 = _mm_add_epi32(tmp4, rounding);
-          tmp5 = _mm_add_epi32(tmp5, rounding);
-          tmp6 = _mm_add_epi32(tmp6, rounding);
-          tmp7 = _mm_add_epi32(tmp7, rounding);
-
-          tmp0 = _mm_srai_epi32(tmp0, 14);
-          tmp1 = _mm_srai_epi32(tmp1, 14);
-          tmp2 = _mm_srai_epi32(tmp2, 14);
-          tmp3 = _mm_srai_epi32(tmp3, 14);
-          tmp4 = _mm_srai_epi32(tmp4, 14);
-          tmp5 = _mm_srai_epi32(tmp5, 14);
-          tmp6 = _mm_srai_epi32(tmp6, 14);
-          tmp7 = _mm_srai_epi32(tmp7, 14);
-
-          stp1_4 = _mm_packs_epi32(tmp0, tmp1);
-          stp1_7 = _mm_packs_epi32(tmp2, tmp3);
-          stp1_5 = _mm_packs_epi32(tmp4, tmp5);
-          stp1_6 = _mm_packs_epi32(tmp6, tmp7);
-        }
-      }
-
-      /* Stage2 */
-      {
-        const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
-        const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
-
-        {
-          tmp0 = _mm_unpacklo_epi16(in0, in4);
-          tmp1 = _mm_unpackhi_epi16(in0, in4);
-
-          tmp2 = _mm_madd_epi16(tmp0, stk2_0);
-          tmp3 = _mm_madd_epi16(tmp1, stk2_0);
-          tmp4 = _mm_madd_epi16(tmp0, stk2_1);
-          tmp5 = _mm_madd_epi16(tmp1, stk2_1);
-
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-          tmp4 = _mm_add_epi32(tmp4, rounding);
-          tmp5 = _mm_add_epi32(tmp5, rounding);
-
-          tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-          tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-          tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-          tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
-          stp2_0 = _mm_packs_epi32(tmp2, tmp3);
-          stp2_1 = _mm_packs_epi32(tmp4, tmp5);
-
-          tmp0 = _mm_madd_epi16(lo_26, stg2_2);
-          tmp1 = _mm_madd_epi16(hi_26, stg2_2);
-          tmp2 = _mm_madd_epi16(lo_26, stg2_3);
-          tmp3 = _mm_madd_epi16(hi_26, stg2_3);
-
-          tmp0 = _mm_add_epi32(tmp0, rounding);
-          tmp1 = _mm_add_epi32(tmp1, rounding);
-          tmp2 = _mm_add_epi32(tmp2, rounding);
-          tmp3 = _mm_add_epi32(tmp3, rounding);
-
-          tmp0 = _mm_srai_epi32(tmp0, 14);
-          tmp1 = _mm_srai_epi32(tmp1, 14);
-          tmp2 = _mm_srai_epi32(tmp2, 14);
-          tmp3 = _mm_srai_epi32(tmp3, 14);
-
-          stp2_2 = _mm_packs_epi32(tmp0, tmp1);
-          stp2_3 = _mm_packs_epi32(tmp2, tmp3);
-        }
-
-        stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-        stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-        stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-        stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-      }
-
-      /* Stage3 */
-      {
-        stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-        stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-        stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-        stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-        tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-        tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-        tmp2 = _mm_madd_epi16(tmp0, stk2_1);
-        tmp3 = _mm_madd_epi16(tmp1, stk2_1);
-        tmp4 = _mm_madd_epi16(tmp0, stk2_0);
-        tmp5 = _mm_madd_epi16(tmp1, stk2_0);
-
-        tmp2 = _mm_add_epi32(tmp2, rounding);
-        tmp3 = _mm_add_epi32(tmp3, rounding);
-        tmp4 = _mm_add_epi32(tmp4, rounding);
-        tmp5 = _mm_add_epi32(tmp5, rounding);
-
-        tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-        tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-        tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-        tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-
-        stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-        stp1_6 = _mm_packs_epi32(tmp4, tmp5);
-      }
-
-      /* Stage4  */
-      in0 = _mm_add_epi16(stp1_0, stp2_7);
-      in1 = _mm_add_epi16(stp1_1, stp1_6);
-      in2 = _mm_add_epi16(stp1_2, stp1_5);
-      in3 = _mm_add_epi16(stp1_3, stp2_4);
-      in4 = _mm_sub_epi16(stp1_3, stp2_4);
-      in5 = _mm_sub_epi16(stp1_2, stp1_5);
-      in6 = _mm_sub_epi16(stp1_1, stp1_6);
-      in7 = _mm_sub_epi16(stp1_0, stp2_7);
-    }
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-  const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-  const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-  const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-
-  // Stage1
-  tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
-  tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
-  tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
-  tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
-
-  stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
-  stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
-
-  // Stage2
-  tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
-  stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
-
-  tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
-  tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
-  stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
-
-  tmp0 = _mm_add_epi16(stp1_4, stp1_5);
-  tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
-
-  stp2_4 = tmp0;
-  stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-  stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-
-  tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-  tmp1 = _mm_madd_epi16(tmp0, stg3_0);
-  tmp2 = _mm_madd_epi16(tmp0, stk2_0);  // stg3_1 = stk2_0
-
-  tmp1 = _mm_add_epi32(tmp1, rounding);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-  stp1_5 = _mm_packs_epi32(tmp1, tmp2);
-
-  // Stage3
-  tmp2 = _mm_add_epi16(stp2_0, stp2_2);
-  tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
-
-  stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
-  stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
-
-  // Stage4
-  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  /* Stage1 */
-  stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
-  stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
-  stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
-  stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
-
-  /* Stage2 */
-  stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
-  stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
-
-  stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
-  stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
-
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
-  tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
-  tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
-  tmp2 = _mm_madd_epi16(tmp0, stk2_0);
-  tmp3 = _mm_madd_epi16(tmp1, stk2_0);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp3 = _mm_add_epi32(tmp3, rounding);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-  stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
-  tmp2 = _mm_madd_epi16(tmp0, stk2_1);
-  tmp3 = _mm_madd_epi16(tmp1, stk2_1);
-  tmp2 = _mm_add_epi32(tmp2, rounding);
-  tmp3 = _mm_add_epi32(tmp3, rounding);
-  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-  stp1_5 = _mm_packs_epi32(tmp2, tmp3);
-
-  /* Stage4  */
-  in0 = _mm_add_epi16(stp1_0, stp2_7);
-  in1 = _mm_add_epi16(stp1_1, stp1_6);
-  in2 = _mm_add_epi16(stp1_2, stp1_5);
-  in3 = _mm_add_epi16(stp1_3, stp2_4);
-  in4 = _mm_sub_epi16(stp1_3, stp2_4);
-  in5 = _mm_sub_epi16(stp1_2, stp1_5);
-  in6 = _mm_sub_epi16(stp1_1, stp1_6);
-  in7 = _mm_sub_epi16(stp1_0, stp2_7);
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-// Only do addition and subtraction butterfly, size = 16, 32
-static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
-                                     int size) {
-  int i = 0;
-  const int num = size >> 1;
-  const int bound = size - 1;
-  while (i < num) {
-    out[i] = _mm_add_epi16(in[i], in[bound - i]);
-    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
-    i++;
-  }
-}
-
-#define BUTTERFLY_PAIR(x0, x1, co0, co1)         \
-  do {                                           \
-    tmp0 = _mm_madd_epi16(x0, co0);              \
-    tmp1 = _mm_madd_epi16(x1, co0);              \
-    tmp2 = _mm_madd_epi16(x0, co1);              \
-    tmp3 = _mm_madd_epi16(x1, co1);              \
-    tmp0 = _mm_add_epi32(tmp0, rounding);        \
-    tmp1 = _mm_add_epi32(tmp1, rounding);        \
-    tmp2 = _mm_add_epi32(tmp2, rounding);        \
-    tmp3 = _mm_add_epi32(tmp3, rounding);        \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-  } while (0)
-
-static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
-                             const __m128i *c0, const __m128i *c1, __m128i *y0,
-                             __m128i *y1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *y0 = _mm_packs_epi32(tmp0, tmp1);
-  *y1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
-                                  const __m128i *c1) {
-  __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  u0 = _mm_unpacklo_epi16(*x0, *x1);
-  u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
-  *x0 = _mm_packs_epi32(tmp0, tmp1);
-  *x1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
-  const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-  const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-  const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-  const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-
-  const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-  const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x4, x5, x6, x7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-
-  // phase 1
-
-  // 0, 15
-  u2 = _mm_mulhrs_epi16(in[2], stk2_1);  // stp2_15
-  u3 = _mm_mulhrs_epi16(in[6], stk2_7);  // stp2_12
-  v15 = _mm_add_epi16(u2, u3);
-  // in[0], in[4]
-  x0 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[0]
-  x7 = _mm_mulhrs_epi16(in[4], stk3_1);  // stp1[7]
-  v0 = _mm_add_epi16(x0, x7);            // stp2_0
-  stp1[0] = _mm_add_epi16(v0, v15);
-  stp1[15] = _mm_sub_epi16(v0, v15);
-
-  // in[2], in[6]
-  u0 = _mm_mulhrs_epi16(in[2], stk2_0);             // stp2_8
-  u1 = _mm_mulhrs_epi16(in[6], stk2_6);             // stp2_11
-  butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5);  // stp2_9, stp2_14
-  butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7);  // stp2_10, stp2_13
-
-  v8 = _mm_add_epi16(u0, u1);
-  v9 = _mm_add_epi16(u4, u6);
-  v10 = _mm_sub_epi16(u4, u6);
-  v11 = _mm_sub_epi16(u0, u1);
-  v12 = _mm_sub_epi16(u2, u3);
-  v13 = _mm_sub_epi16(u5, u7);
-  v14 = _mm_add_epi16(u5, u7);
-
-  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
-  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
-
-  // 1, 14
-  x1 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
-  // stp1[2] = stp1[0], stp1[3] = stp1[1]
-  x4 = _mm_mulhrs_epi16(in[4], stk3_0);  // stp1[4]
-  butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6);
-  v1 = _mm_add_epi16(x1, x6);  // stp2_1
-  v2 = _mm_add_epi16(x0, x5);  // stp2_2
-  stp1[1] = _mm_add_epi16(v1, v14);
-  stp1[14] = _mm_sub_epi16(v1, v14);
-
-  stp1[2] = _mm_add_epi16(v2, v13);
-  stp1[13] = _mm_sub_epi16(v2, v13);
-
-  v3 = _mm_add_epi16(x1, x4);  // stp2_3
-  v4 = _mm_sub_epi16(x1, x4);  // stp2_4
-
-  v5 = _mm_sub_epi16(x0, x5);  // stp2_5
-
-  v6 = _mm_sub_epi16(x1, x6);  // stp2_6
-  v7 = _mm_sub_epi16(x0, x7);  // stp2_7
-  stp1[3] = _mm_add_epi16(v3, v12);
-  stp1[12] = _mm_sub_epi16(v3, v12);
-
-  stp1[6] = _mm_add_epi16(v6, v9);
-  stp1[9] = _mm_sub_epi16(v6, v9);
-
-  stp1[7] = _mm_add_epi16(v7, v8);
-  stp1[8] = _mm_sub_epi16(v7, v8);
-
-  stp1[4] = _mm_add_epi16(v4, v11);
-  stp1[11] = _mm_sub_epi16(v4, v11);
-
-  stp1[5] = _mm_add_epi16(v5, v10);
-  stp1[10] = _mm_sub_epi16(v5, v10);
-}
-
-static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
-  const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-  const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-  const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-  const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-  const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-  const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-  const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-  const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  v16 = _mm_mulhrs_epi16(in[1], stk1_0);
-  v31 = _mm_mulhrs_epi16(in[1], stk1_1);
-
-  v19 = _mm_mulhrs_epi16(in[7], stk1_6);
-  v28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
-  v20 = _mm_mulhrs_epi16(in[5], stk1_8);
-  v27 = _mm_mulhrs_epi16(in[5], stk1_9);
-
-  v23 = _mm_mulhrs_epi16(in[3], stk1_14);
-  v24 = _mm_mulhrs_epi16(in[3], stk1_15);
-
-  butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30);
-  butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29);
-  butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26);
-  butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25);
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-  u24 = _mm_add_epi16(v24, v27);
-  u27 = _mm_sub_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u28 = _mm_sub_epi16(v31, v28);
-  u31 = _mm_add_epi16(v28, v31);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-
-  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-
-  stp1[16] = _mm_add_epi16(u16, u23);
-  stp1[23] = _mm_sub_epi16(u16, u23);
-
-  stp1[17] = _mm_add_epi16(u17, u22);
-  stp1[22] = _mm_sub_epi16(u17, u22);
-
-  stp1[18] = _mm_add_epi16(u18, u21);
-  stp1[21] = _mm_sub_epi16(u18, u21);
-
-  stp1[19] = _mm_add_epi16(u19, u20);
-  stp1[20] = _mm_sub_epi16(u19, u20);
-
-  stp1[24] = _mm_sub_epi16(u31, u24);
-  stp1[31] = _mm_add_epi16(u24, u31);
-
-  stp1[25] = _mm_sub_epi16(u30, u25);
-  stp1[30] = _mm_add_epi16(u25, u30);
-
-  stp1[26] = _mm_sub_epi16(u29, u26);
-  stp1[29] = _mm_add_epi16(u26, u29);
-
-  stp1[27] = _mm_sub_epi16(u28, u27);
-  stp1[28] = _mm_add_epi16(u27, u28);
-
-  butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
-  butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
-}
-
-// Only upper-left 8x8 has non-zero coeff
-void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  __m128i in[32], col[32];
-  __m128i stp1[32];
-  int i;
-
-  // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
-
-  array_transpose_8x8(in, in);
-  idct32_34_first_half(in, stp1);
-  idct32_34_second_half(in, stp1);
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  add_sub_butterfly(stp1, col, 32);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    idct32_34_first_half(in, stp1);
-    idct32_34_second_half(in, stp1);
-
-    // 2_D: Calculate the results and store them to destination.
-    add_sub_butterfly(stp1, in, 32);
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-// in0[16] represents the left 8x16 block
-// in1[16] represents the right 8x16 block
-static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
-                              __m128i *in1) {
-  int i;
-  for (i = 0; i < 16; i++) {
-    in0[i] = load_input_data(input);
-    in1[i] = load_input_data(input + 8);
-    input += 32;
-  }
-}
-
-static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
-                                    __m128i *out1) {
-  array_transpose_8x8(in0, out0);
-  array_transpose_8x8(&in0[8], out1);
-  array_transpose_8x8(in1, &out0[8]);
-  array_transpose_8x8(&in1[8], &out1[8]);
-}
-
-// Group the coefficient calculation into smaller functions
-// to prevent stack spillover:
-// quarter_1: 0-7
-// quarter_2: 8-15
-// quarter_3_4: 16-23, 24-31
-static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/,
-                                      __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  {
-    const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
-    const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
-    const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
-    u0 = _mm_mulhrs_epi16(in[0], stk4_0);
-    u2 = _mm_mulhrs_epi16(in[8], stk4_2);
-    u3 = _mm_mulhrs_epi16(in[8], stk4_3);
-    u1 = u0;
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  {
-    const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
-    const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
-    const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
-    const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
-    u4 = _mm_mulhrs_epi16(in[4], stk3_0);
-    u7 = _mm_mulhrs_epi16(in[4], stk3_1);
-    u5 = _mm_mulhrs_epi16(in[12], stk3_2);
-    u6 = _mm_mulhrs_epi16(in[12], stk3_3);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-  }
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
-                                      __m128i *out /*out[8]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;
-
-  {
-    const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
-    const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
-    const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
-    const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
-    const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
-    const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
-    const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
-    const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
-    u8 = _mm_mulhrs_epi16(in[2], stk2_0);
-    u15 = _mm_mulhrs_epi16(in[2], stk2_1);
-    u9 = _mm_mulhrs_epi16(in[14], stk2_2);
-    u14 = _mm_mulhrs_epi16(in[14], stk2_3);
-    u10 = _mm_mulhrs_epi16(in[10], stk2_4);
-    u13 = _mm_mulhrs_epi16(in[10], stk2_5);
-    u11 = _mm_mulhrs_epi16(in[6], stk2_6);
-    u12 = _mm_mulhrs_epi16(in[6], stk2_7);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// 8x32 block even indexed 8 inputs of in[16],
-// output first half 16 to out[32]
-static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/,
-                                    __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_8x32_135_quarter_1(in, temp);
-  idct32_8x32_135_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-// 8x32 block odd indexed 8 inputs of in[16],
-// output second half 16 to out[32]
-static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
-                                    __m128i *out /*out[32]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
-    const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
-    const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
-    const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
-
-    const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
-    const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
-    const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
-    const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
-    const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
-    const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
-    const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
-    const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
-
-    const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
-    const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
-    const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
-    const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
-    u16 = _mm_mulhrs_epi16(in[1], stk1_0);
-    u31 = _mm_mulhrs_epi16(in[1], stk1_1);
-    u17 = _mm_mulhrs_epi16(in[15], stk1_2);
-    u30 = _mm_mulhrs_epi16(in[15], stk1_3);
-
-    u18 = _mm_mulhrs_epi16(in[9], stk1_4);
-    u29 = _mm_mulhrs_epi16(in[9], stk1_5);
-    u19 = _mm_mulhrs_epi16(in[7], stk1_6);
-    u28 = _mm_mulhrs_epi16(in[7], stk1_7);
-
-    u20 = _mm_mulhrs_epi16(in[5], stk1_8);
-    u27 = _mm_mulhrs_epi16(in[5], stk1_9);
-    u21 = _mm_mulhrs_epi16(in[11], stk1_10);
-    u26 = _mm_mulhrs_epi16(in[11], stk1_11);
-
-    u22 = _mm_mulhrs_epi16(in[13], stk1_12);
-    u25 = _mm_mulhrs_epi16(in[13], stk1_13);
-    u23 = _mm_mulhrs_epi16(in[3], stk1_14);
-    u24 = _mm_mulhrs_epi16(in[3], stk1_15);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  v20 = _mm_sub_epi16(u19, u20);
-  v21 = _mm_sub_epi16(u18, u21);
-  v22 = _mm_sub_epi16(u17, u22);
-  v23 = _mm_sub_epi16(u16, u23);
-
-  v24 = _mm_sub_epi16(u31, u24);
-  v25 = _mm_sub_epi16(u30, u25);
-  v26 = _mm_sub_epi16(u29, u26);
-  v27 = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
-    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
-    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
-    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
-  }
-}
-
-// 8x16 block, input __m128i in[16], output __m128i in[32]
-static void idct32_8x32_135(__m128i *in /*in[32]*/) {
-  __m128i out[32];
-  idct32_8x32_quarter_1_2(in, out);
-  idct32_8x32_quarter_3_4(in, &out[16]);
-  add_sub_butterfly(out, in, 32);
-}
-
-static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  int j = 0;
-  while (j < 32) {
-    in[j] = _mm_adds_epi16(in[j], final_rounding);
-    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
-
-    in[j] = _mm_srai_epi16(in[j], 6);
-    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
-
-    RECON_AND_STORE(dst, in[j]);
-    dst += stride;
-    RECON_AND_STORE(dst, in[j + 1]);
-    dst += stride;
-    j += 2;
-  }
-}
-
-static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
-                                   int stride) {
-  store_buffer_8x32(in0, dest, stride);
-  store_buffer_8x32(in1, dest + 8, stride);
-}
-
-static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
-  idct32_8x32_135(col0);
-  idct32_8x32_135(col1);
-}
-
-typedef enum { left_16, right_16 } ColsIndicator;
-
-static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
-                                     ColsIndicator cols) {
-  switch (cols) {
-    case left_16: {
-      int i;
-      array_transpose_16x16(in0, in1);
-      for (i = 0; i < 16; ++i) {
-        store[i] = in0[16 + i];
-        store[16 + i] = in1[16 + i];
-      }
-      break;
-    }
-    case right_16: {
-      array_transpose_16x16_2(store, &store[16], in0, in1);
-      break;
-    }
-    default: { assert(0); }
-  }
-}
-
-// Only upper-left 16x16 has non-zero coeff
-void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
-  // Each array represents an 8x32 block
-  __m128i col0[32], col1[32];
-  // This array represents a 16x16 block
-  __m128i temp[32];
-
-  // Load input data. Only need to load the top left 16x16 block.
-  load_buffer_16x16(input, col0, col1);
-
-  // columns
-  array_transpose_16x16(col0, col1);
-  idct32_135(col0, col1);
-
-  // rows
-  transpose_and_copy_16x16(col0, col1, temp, left_16);
-  idct32_135(col0, col1);
-  recon_and_store(col0, col1, dest, stride);
-
-  transpose_and_copy_16x16(col0, col1, temp, right_16);
-  idct32_135(col0, col1);
-  recon_and_store(col0, col1, dest + 16, stride);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m128i in[32]
-static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[16]*/) {
-  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
-  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_
-
-  {
-    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
-    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
-  }
-
-  v8 = _mm_add_epi16(u8, u9);
-  v9 = _mm_sub_epi16(u8, u9);
-  v14 = _mm_sub_epi16(u15, u14);
-  v15 = _mm_add_epi16(u15, u14);
-
-  {
-    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
-    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
-  }
-
-  v10 = _mm_sub_epi16(u11, u10);
-  v11 = _mm_add_epi16(u11, u10);
-  v12 = _mm_add_epi16(u12, u13);
-  v13 = _mm_sub_epi16(u12, u13);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
-    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(v8, v11);
-  out[1] = _mm_add_epi16(v9, v10);
-  out[6] = _mm_add_epi16(v14, v13);
-  out[7] = _mm_add_epi16(v15, v12);
-
-  out[2] = _mm_sub_epi16(v9, v10);
-  out[3] = _mm_sub_epi16(v8, v11);
-  out[4] = _mm_sub_epi16(v15, v12);
-  out[5] = _mm_sub_epi16(v14, v13);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
-    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
-  }
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m128i in[32]
-static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
-                                       __m128i *out /*out[8]*/) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_
-
-  {
-    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
-    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
-  }
-
-  v4 = _mm_add_epi16(u4, u5);
-  v5 = _mm_sub_epi16(u4, u5);
-  v6 = _mm_sub_epi16(u7, u6);
-  v7 = _mm_add_epi16(u7, u6);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
-    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
-    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
-  }
-
-  v0 = _mm_add_epi16(u0, u3);
-  v1 = _mm_add_epi16(u1, u2);
-  v2 = _mm_sub_epi16(u1, u2);
-  v3 = _mm_sub_epi16(u0, u3);
-
-  out[0] = _mm_add_epi16(v0, v7);
-  out[1] = _mm_add_epi16(v1, v6);
-  out[2] = _mm_add_epi16(v2, v5);
-  out[3] = _mm_add_epi16(v3, v4);
-  out[4] = _mm_sub_epi16(v3, v4);
-  out[5] = _mm_sub_epi16(v2, v5);
-  out[6] = _mm_sub_epi16(v1, v6);
-  out[7] = _mm_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m128i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[16]*/) {
-  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
-  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
-  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
-  __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
-  {
-    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
-    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
-    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
-    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
-    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
-    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
-    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
-    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
-  }
-
-  v16 = _mm_add_epi16(u16, u17);
-  v17 = _mm_sub_epi16(u16, u17);
-  v18 = _mm_sub_epi16(u19, u18);
-  v19 = _mm_add_epi16(u19, u18);
-
-  v20 = _mm_add_epi16(u20, u21);
-  v21 = _mm_sub_epi16(u20, u21);
-  v22 = _mm_sub_epi16(u23, u22);
-  v23 = _mm_add_epi16(u23, u22);
-
-  v24 = _mm_add_epi16(u24, u25);
-  v25 = _mm_sub_epi16(u24, u25);
-  v26 = _mm_sub_epi16(u27, u26);
-  v27 = _mm_add_epi16(u27, u26);
-
-  v28 = _mm_add_epi16(u28, u29);
-  v29 = _mm_sub_epi16(u28, u29);
-  v30 = _mm_sub_epi16(u31, u30);
-  v31 = _mm_add_epi16(u31, u30);
-
-  {
-    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
-    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
-    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
-    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
-  }
-
-  u16 = _mm_add_epi16(v16, v19);
-  u17 = _mm_add_epi16(v17, v18);
-  u18 = _mm_sub_epi16(v17, v18);
-  u19 = _mm_sub_epi16(v16, v19);
-  u20 = _mm_sub_epi16(v23, v20);
-  u21 = _mm_sub_epi16(v22, v21);
-  u22 = _mm_add_epi16(v22, v21);
-  u23 = _mm_add_epi16(v23, v20);
-
-  u24 = _mm_add_epi16(v24, v27);
-  u25 = _mm_add_epi16(v25, v26);
-  u26 = _mm_sub_epi16(v25, v26);
-  u27 = _mm_sub_epi16(v24, v27);
-
-  u28 = _mm_sub_epi16(v31, v28);
-  u29 = _mm_sub_epi16(v30, v29);
-  u30 = _mm_add_epi16(v29, v30);
-  u31 = _mm_add_epi16(v28, v31);
-
-  {
-    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
-    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
-    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
-    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
-  }
-
-  out[0] = _mm_add_epi16(u16, u23);
-  out[1] = _mm_add_epi16(u17, u22);
-  out[2] = _mm_add_epi16(u18, u21);
-  out[3] = _mm_add_epi16(u19, u20);
-  out[4] = _mm_sub_epi16(u19, u20);
-  out[5] = _mm_sub_epi16(u18, u21);
-  out[6] = _mm_sub_epi16(u17, u22);
-  out[7] = _mm_sub_epi16(u16, u23);
-
-  out[8] = _mm_sub_epi16(u31, u24);
-  out[9] = _mm_sub_epi16(u30, u25);
-  out[10] = _mm_sub_epi16(u29, u26);
-  out[11] = _mm_sub_epi16(u28, u27);
-  out[12] = _mm_add_epi16(u27, u28);
-  out[13] = _mm_add_epi16(u26, u29);
-  out[14] = _mm_add_epi16(u25, u30);
-  out[15] = _mm_add_epi16(u24, u31);
-
-  {
-    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
-    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
-    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
-    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
-  }
-}
-
-static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
-                                         __m128i *out /*out[32]*/) {
-  __m128i temp[16];
-  idct32_full_8x32_quarter_1(in, temp);
-  idct32_full_8x32_quarter_2(in, &temp[8]);
-  add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_full_8x32(const __m128i *in /*in[32]*/,
-                             __m128i *out /*out[32]*/) {
-  __m128i temp[32];
-  idct32_full_8x32_quarter_1_2(in, temp);
-  idct32_full_8x32_quarter_3_4(in, &temp[16]);
-  add_sub_butterfly(temp, out, 32);
-}
-
-static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
-  int i;
-  for (i = 0; i < 8; ++i) {
-    in[i] = load_input_data(input);
-    in[i + 8] = load_input_data(input + 8);
-    in[i + 16] = load_input_data(input + 16);
-    in[i + 24] = load_input_data(input + 24);
-    input += 32;
-  }
-}
-
-void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
-                                  int stride) {
-  __m128i col[128], in[32];
-  int i, j;
-
-  // rows
-  for (i = 0; i < 4; ++i) {
-    load_buffer_8x32(input, in);
-    input += 32 << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    idct32_full_8x32(in, col + (i << 5));
-  }
-
-  // columns
-  for (i = 0; i < 4; ++i) {
-    j = i << 3;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    idct32_full_8x32(in, in);
-    store_buffer_8x32(in, dest, stride);
-    dest += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
index f0668e6f3..0bc841a7a 100644
--- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -85,15 +85,10 @@ SECTION .text
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_HIGHBITDEPTH
   mova            m0,        [inputq +  0]
   packssdw        m0,        [inputq + 16]
   mova            m1,        [inputq + 32]
   packssdw        m1,        [inputq + 48]
-%else
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-%endif
   psraw           m0,        2
   psraw           m1,        2
 
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
new file mode 100644
index 000000000..c3c88245a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int width, int height) {
+  int i;
+  assert(width == 4);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; i += 4) {
+    __m128i x0 = xx_loadl_32(a + 0 * a_stride);
+    __m128i x1 = xx_loadl_32(a + 1 * a_stride);
+    __m128i x2 = xx_loadl_32(a + 2 * a_stride);
+    __m128i x3 = xx_loadl_32(a + 3 * a_stride);
+    __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
+    __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
+
+    __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
+
+    x0 = xx_loadl_32(b + 0 * b_stride);
+    x1 = xx_loadl_32(b + 1 * b_stride);
+    x2 = xx_loadl_32(b + 2 * b_stride);
+    x3 = xx_loadl_32(b + 3 * b_stride);
+    x_lo = _mm_unpacklo_epi32(x0, x1);
+    x_hi = _mm_unpacklo_epi32(x2, x3);
+
+    __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
+
+    __m128i sad4x4 = _mm_sad_epu8(x, y);
+    sad = _mm_add_epi32(sad, sad4x4);
+
+    a += 4 * a_stride;
+    b += 4 * b_stride;
+  }
+
+  // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int width, int height) {
+  int i;
+  assert(width == 8);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; i += 2) {
+    __m128i x0 = xx_loadl_64(a + 0 * a_stride);
+    __m128i x1 = xx_loadl_64(a + 1 * a_stride);
+
+    __m128i x = _mm_unpacklo_epi64(x0, x1);
+
+    x0 = xx_loadl_64(b + 0 * b_stride);
+    x1 = xx_loadl_64(b + 1 * b_stride);
+
+    __m128i y = _mm_unpacklo_epi64(x0, x1);
+
+    __m128i sad8x2 = _mm_sad_epu8(x, y);
+    sad = _mm_add_epi32(sad, sad8x2);
+
+    a += 2 * a_stride;
+    b += 2 * b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  int i;
+  assert(width == 16);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    __m128i x = xx_loadu_128(a);
+    __m128i y = xx_loadu_128(b);
+
+    __m128i sad16x1 = _mm_sad_epu8(x, y);
+    sad = _mm_add_epi32(sad, sad16x1);
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  int i, j;
+  assert(width == 32);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < 2; ++j) {
+      __m128i x = xx_loadu_128(a + j * 16);
+      __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad32_half = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad32_half);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                              int b_stride, int width, int height) {
+  int i, j;
+  assert(width == 64);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < 4; ++j) {
+      __m128i x = xx_loadu_128(a + j * 16);
+      __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad64_quarter = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad64_quarter);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int width, int height) {
+  int i, j;
+  assert(width == 128);
+  (void)width;
+
+  __m128i sad = _mm_setzero_si128();
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < 8; ++j) {
+      __m128i x = xx_loadu_128(a + j * 16);
+      __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad64_quarter = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad64_quarter);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  const unsigned int res =
+      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+  return res;
+}
+
+#define jnt_sadMxN_sse2(m, n)                                                 \
+  unsigned int aom_jnt_sad##m##x##n##_avg_ssse3(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
+                          jcp_param);                                         \
+    return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
+  }
+
+#define jnt_sadMxN_avx2(m, n)                                                 \
+  unsigned int aom_jnt_sad##m##x##n##_avg_avx2(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
+    uint8_t comp_pred[m * n];                                                 \
+    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
+                          jcp_param);                                         \
+    return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
+  }
+
+/* clang-format off */
+jnt_sadMxN_sse2(128, 128)
+jnt_sadMxN_sse2(128, 64)
+jnt_sadMxN_sse2(64, 128)
+jnt_sadMxN_sse2(64, 64)
+jnt_sadMxN_sse2(64, 32)
+jnt_sadMxN_sse2(32, 64)
+jnt_sadMxN_sse2(32, 32)
+jnt_sadMxN_sse2(32, 16)
+jnt_sadMxN_sse2(16, 32)
+jnt_sadMxN_sse2(16, 16)
+jnt_sadMxN_sse2(16, 8)
+jnt_sadMxN_sse2(8, 16)
+jnt_sadMxN_sse2(8, 8)
+jnt_sadMxN_sse2(8, 4)
+jnt_sadMxN_sse2(4, 8)
+jnt_sadMxN_sse2(4, 4)
+jnt_sadMxN_sse2(4, 16)
+jnt_sadMxN_sse2(16, 4)
+jnt_sadMxN_sse2(8, 32)
+jnt_sadMxN_sse2(32, 8)
+jnt_sadMxN_sse2(16, 64)
+jnt_sadMxN_sse2(64, 16)
+    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 000000000..9801e285c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+  // in computation using _mm_maddubs_epi16.
+  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = _mm_setzero_si128();
+
+        // avoid load undefined memory
+        if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8);
+        __m128i source = _mm_unpacklo_epi64(source_low, source_hi);
+
+        // shuffle to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    for (i = 0; i < output_height; ++i) {
+      // load source, only first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle, up to the first 8 are useful
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to get each lower 8 bit of every 32 bit
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
+
+static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                 int width, int height, const uint8_t *ref,
+                                 int ref_stride,
+                                 const JNT_COMP_PARAMS *jcp_param) {
+  int i;
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  if (width >= 16) {
+    // Read 16 pixels one row at a time
+    assert(!(width & 15));
+    for (i = 0; i < height; ++i) {
+      int j;
+      for (j = 0; j < width; j += 16) {
+        __m128i p0 = xx_loadu_128(ref);
+        __m128i p1 = xx_loadu_128(pred);
+
+        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+        comp_pred += 16;
+        pred += 16;
+        ref += 16;
+      }
+      ref += ref_stride - width;
+    }
+  } else if (width >= 8) {
+    // Read 8 pixels two row at a time
+    assert(!(width & 7));
+    assert(!(width & 1));
+    for (i = 0; i < height; i += 2) {
+      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 2 * ref_stride;
+    }
+  } else {
+    // Read 4 pixels four row at a time
+    assert(!(width & 3));
+    assert(!(height & 3));
+    for (i = 0; i < height; i += 4) {
+      const uint8_t *row0 = ref + 0 * ref_stride;
+      const uint8_t *row1 = ref + 1 * ref_stride;
+      const uint8_t *row2 = ref + 2 * ref_stride;
+      const uint8_t *row3 = ref + 3 * ref_stride;
+
+      __m128i p0 =
+          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+                        row3[0], row3[1], row3[2], row3[3]);
+      __m128i p1 = xx_loadu_128(pred);
+
+      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+      comp_pred += 16;
+      pred += 16;
+      ref += 4 * ref_stride;
+    }
+  }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const JNT_COMP_PARAMS *jcp_param) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+    comp_pred += 16;
+    pred += 16;
+  }
+}
+
+#define JNT_SUBPIX_AVG_VAR(W, H)                                         \
+  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3(              \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
+      const uint8_t *b, int b_stride, uint32_t *sse,                     \
+      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {    \
+    uint16_t fdata3[(H + 1) * W];                                        \
+    uint8_t temp2[H * W];                                                \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                          \
+                                                                         \
+    aom_var_filter_block2d_bil_first_pass_ssse3(                         \
+        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_ssse3(                        \
+        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);        \
+                                                                         \
+    aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,      \
+                                jcp_param);                              \
+                                                                         \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);            \
+  }
+
+JNT_SUBPIX_AVG_VAR(128, 128)
+JNT_SUBPIX_AVG_VAR(128, 64)
+JNT_SUBPIX_AVG_VAR(64, 128)
+JNT_SUBPIX_AVG_VAR(64, 64)
+JNT_SUBPIX_AVG_VAR(64, 32)
+JNT_SUBPIX_AVG_VAR(32, 64)
+JNT_SUBPIX_AVG_VAR(32, 32)
+JNT_SUBPIX_AVG_VAR(32, 16)
+JNT_SUBPIX_AVG_VAR(16, 32)
+JNT_SUBPIX_AVG_VAR(16, 16)
+JNT_SUBPIX_AVG_VAR(16, 8)
+JNT_SUBPIX_AVG_VAR(8, 16)
+JNT_SUBPIX_AVG_VAR(8, 8)
+JNT_SUBPIX_AVG_VAR(8, 4)
+JNT_SUBPIX_AVG_VAR(4, 8)
+JNT_SUBPIX_AVG_VAR(4, 4)
+JNT_SUBPIX_AVG_VAR(4, 16)
+JNT_SUBPIX_AVG_VAR(16, 4)
+JNT_SUBPIX_AVG_VAR(8, 32)
+JNT_SUBPIX_AVG_VAR(32, 8)
+JNT_SUBPIX_AVG_VAR(16, 64)
+JNT_SUBPIX_AVG_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
index bf8150e2a..18862dd3e 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -11,13 +11,14 @@
 
 #include <immintrin.h> /* AVX2 */
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
-void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
-                                    const unsigned char *_blimit,
-                                    const unsigned char *_limit,
-                                    const unsigned char *_thresh) {
+void aom_lpf_horizontal_16_avx2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
@@ -368,7 +369,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
+void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                      const unsigned char *_blimit,
                                      const unsigned char *_limit,
                                      const unsigned char *_thresh) {
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
index 8343dbbed..f1eac233b 100644
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -11,7 +11,9 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/emmintrin_compat.h"
 
@@ -19,1047 +21,1016 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
-#if CONFIG_PARALLEL_DEBLOCKING
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK4                                                      \
-  do {                                                                        \
-    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
-    __m128i flat = abs_diff(q1p1, q0p0);                                      \
-    /* abs(p1 - q1), abs(p0 - q0) */                                          \
-    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
-    __m128i abs_p0q0, abs_p1q1;                                               \
-                                                                              \
-    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
-    hev =                                                                     \
-        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
-    hev = _mm_packs_epi16(hev, hev);                                          \
-                                                                              \
-    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
-    /*                                  p1, p0, q0, q1); */                   \
-    abs_p0q0 =                                                                \
-        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
-    abs_p1q1 =                                                                \
-        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
-    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
-    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
-    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
-    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
-    mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
-    mask = _mm_cmpeq_epi8(mask, zero);                                        \
-    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
-  } while (0)
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-
-// filter_mask and hev_mask
-#define FILTER_HEV_MASK                                                       \
-  do {                                                                        \
-    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
-    __m128i flat = abs_diff(q1p1, q0p0);                                      \
-    /* abs(p1 - q1), abs(p0 - q0) */                                          \
-    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
-    __m128i abs_p0q0, abs_p1q1, work;                                         \
-                                                                              \
-    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
-    hev =                                                                     \
-        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
-    hev = _mm_packs_epi16(hev, hev);                                          \
-                                                                              \
-    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
-    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
-    abs_p0q0 =                                                                \
-        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
-    abs_p1q1 =                                                                \
-        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
-    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
-    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
-    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
-    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
-    /* abs(p3 - p2), abs(p2 - p1) */                                          \
-    work = abs_diff(p3p2, p2p1);                                              \
-    flat = _mm_max_epu8(work, flat);                                          \
-    /* abs(q3 - q2), abs(q2 - q1) */                                          \
-    work = abs_diff(q3q2, q2q1);                                              \
-    flat = _mm_max_epu8(work, flat);                                          \
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
-    mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
-    mask = _mm_cmpeq_epi8(mask, zero);                                        \
-    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
-  } while (0)
-
-#define FILTER4                                                             \
-  do {                                                                      \
-    const __m128i t3t4 =                                                    \
-        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
-    const __m128i t80 = _mm_set1_epi8(0x80);                                \
-    __m128i filter, filter2filter1, work;                                   \
-                                                                            \
-    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
-    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
-                                                                            \
-    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
-    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
-    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
-    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
-    filter = _mm_subs_epi8(filter, work);                                   \
-    filter = _mm_subs_epi8(filter, work);                                   \
-    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
-    filter = _mm_and_si128(filter, mask); /* & mask */                      \
-    filter = _mm_unpacklo_epi64(filter, filter);                            \
-                                                                            \
-    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
-    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
-    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
-    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
-    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
-    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
-    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
-    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
-                                                                            \
-    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
-    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
-    filter = _mm_unpacklo_epi8(filter, filter);                             \
-    filter = _mm_srai_epi16(filter, 9); /* round */                         \
-    filter = _mm_packs_epi16(filter, filter);                               \
-    filter = _mm_andnot_si128(hev, filter);                                 \
-                                                                            \
-    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
-    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
-                                                                            \
-    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
-    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
-    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
-    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
-    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
-    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
-  } while (0)
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                             __m128i *x2, __m128i *x3,
+                                             __m128i *d0, __m128i *d1,
+                                             __m128i *d2, __m128i *d3) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  *d0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+  *d1 = _mm_srli_si128(*d0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(*d0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(*d0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *d0, __m128i *d1,
+                                         __m128i *d2, __m128i *d3, __m128i *d4,
+                                         __m128i *d5, __m128i *d6,
+                                         __m128i *d7) {
+  // input
+  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+  // output
+  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  ww0 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d1 = _mm_srli_si128(ww0,
+                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d2 = _mm_srli_si128(ww0,
+                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d3 = _mm_srli_si128(ww0,
+                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d5 = _mm_srli_si128(ww1,
+                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d6 = _mm_srli_si128(ww1,
+                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+  *d7 = _mm_srli_si128(ww1,
+                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                         __m128i *x3, __m128i *x4, __m128i *x5,
+                                         __m128i *x6, __m128i *x7, __m128i *d0,
+                                         __m128i *d1, __m128i *d2,
+                                         __m128i *d3) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5  50 51 52 53 54 55 56 57
+  // x6  60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
+  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+  __m128i w0, w1, w2, w3, w4, w5;
+
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d1 = _mm_srli_si128(*d0, 8);
+  *d2 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+                                     __m128i *x3, __m128i *x4, __m128i *x5,
+                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
+                                     __m128i *d2d3, __m128i *d4d5,
+                                     __m128i *d6d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  w0 = _mm_unpacklo_epi8(
+      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  w1 = _mm_unpacklo_epi8(
+      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+  // x4 40 41 42 43 44 45 46 47
+  // x5  50 51 52 53 54 55 56 57
+  w2 = _mm_unpacklo_epi8(
+      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+  // x6  60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w3 = _mm_unpacklo_epi8(
+      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+  w4 = _mm_unpacklo_epi16(
+      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  w5 = _mm_unpacklo_epi16(
+      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+  *d0d1 = _mm_unpacklo_epi32(
+      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  *d2d3 = _mm_unpackhi_epi32(
+      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+  w6 = _mm_unpackhi_epi16(
+      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  w7 = _mm_unpackhi_epi16(
+      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+  *d4d5 = _mm_unpacklo_epi32(
+      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  *d6d7 = _mm_unpackhi_epi32(
+      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+  w8 = _mm_unpacklo_epi8(*x8, *x9);
+  w9 = _mm_unpacklo_epi8(*x10, *x11);
+  w10 = _mm_unpacklo_epi8(*x12, *x13);
+  w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  *d0 = _mm_unpacklo_epi64(w6, w14);
+  *d1 = _mm_unpackhi_epi64(w6, w14);
+  *d2 = _mm_unpacklo_epi64(w7, w15);
+  *d3 = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  *d4 = _mm_unpacklo_epi64(w6, w14);
+  *d5 = _mm_unpackhi_epi64(w6, w14);
+  *d6 = _mm_unpacklo_epi64(w7, w15);
+  *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+    __m128i *d12d13, __m128i *d14d15) {
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+  __m128i w10, w11, w12, w13, w14, w15;
+
+  w0 = _mm_unpacklo_epi8(*x0, *x1);
+  w1 = _mm_unpacklo_epi8(*x2, *x3);
+  w2 = _mm_unpacklo_epi8(*x4, *x5);
+  w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+  w8 = _mm_unpackhi_epi8(*x0, *x1);
+  w9 = _mm_unpackhi_epi8(*x2, *x3);
+  w10 = _mm_unpackhi_epi8(*x4, *x5);
+  w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+  w4 = _mm_unpacklo_epi16(w0, w1);
+  w5 = _mm_unpacklo_epi16(w2, w3);
+  w12 = _mm_unpacklo_epi16(w8, w9);
+  w13 = _mm_unpacklo_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store first 4-line result
+  *d0d1 = _mm_unpacklo_epi64(w6, w14);
+  *d2d3 = _mm_unpackhi_epi64(w6, w14);
+  *d4d5 = _mm_unpacklo_epi64(w7, w15);
+  *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+  w4 = _mm_unpackhi_epi16(w0, w1);
+  w5 = _mm_unpackhi_epi16(w2, w3);
+  w12 = _mm_unpackhi_epi16(w8, w9);
+  w13 = _mm_unpackhi_epi16(w10, w11);
+
+  w6 = _mm_unpacklo_epi32(w4, w5);
+  w7 = _mm_unpackhi_epi32(w4, w5);
+  w14 = _mm_unpacklo_epi32(w12, w13);
+  w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store second 4-line result
+  *d8d9 = _mm_unpacklo_epi64(w6, w14);
+  *d10d11 = _mm_unpackhi_epi64(w6, w14);
+  *d12d13 = _mm_unpacklo_epi64(w7, w15);
+  *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+                                          __m128i *hev, __m128i *mask,
+                                          __m128i *qs1qs0, __m128i *ps1ps0) {
+  const __m128i t3t4 =
+      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
+  const __m128i t80 = _mm_set1_epi8(0x80);
+  __m128i filter, filter2filter1, work;
+  __m128i ps1ps0_work, qs1qs0_work;
+  __m128i hev1;
+  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
+  filter = _mm_and_si128(filter, *mask); /* & mask */
+  filter = _mm_unpacklo_epi64(filter, filter);
+
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+  filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
+
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+  filter = _mm_unpacklo_epi8(filter, filter);
+  filter = _mm_srai_epi16(filter, 9); /* round */
+  filter = _mm_packs_epi16(filter, filter);
+  filter = _mm_andnot_si128(*hev, filter);
+
+  hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
+
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+  __m128i q1p1, q0p0, p1p0, q1q0;
+  __m128i abs_p0q0, abs_p1q1;
+  __m128i mask, hev;
+  const __m128i zero = _mm_setzero_si128();
+
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+  /* (abs(q1 - q0), abs(p1 - p0) */
+  __m128i flat = abs_diff(q1p1, q0p0);
+  /* abs(p1 - q1), abs(p0 - q0) */
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+  hev = _mm_unpacklo_epi8(flat, zero);
+
+  hev = _mm_cmpgt_epi16(hev, *thresh);
+  hev = _mm_packs_epi16(hev, hev);
+
+  /* const int8_t mask = filter_mask2(*limit, *blimit, */
+  /*                                  p1, p0, q0, q1); */
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+  mask = _mm_unpacklo_epi64(mask, flat);
+  mask = _mm_subs_epu8(mask, *limit);
+  mask = _mm_cmpeq_epi8(mask, zero);
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
 
 void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                                const uint8_t *_blimit, const uint8_t *_limit,
                                const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
+  const __m128i zero = _mm_setzero_si128();
+  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                                     _mm_loadl_epi64((const __m128i *)_limit));
+  __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3p2, p2p1, q3q2, q2q1;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i mask, hev;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
-#if !CONFIG_PARALLEL_DEBLOCKING
-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
-  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-#if !CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK;
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK4;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  FILTER4;
-
-#if CONFIG_PARALLEL_DEBLOCKING
-  *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 8);
-  *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0);
-
-  *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 8);
-  *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0);
-#else
-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
-#endif
+
+  __m128i qs1qs0, ps1ps0;
+  __m128i p1, p0, q0, q1;
+
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
+
+  xx_storel_32(s - 1 * p, ps1ps0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
+  xx_storel_32(s + 0 * p, qs1qs0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
 }
 
 void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                              const uint8_t *_blimit, const uint8_t *_limit,
                              const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
+  __m128i p1p0, q1q0;
+  __m128i p1, p0, q0, q1;
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                                     _mm_loadl_epi64((const __m128i *)_limit));
+  __m128i thresh =
       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
   __m128i x0, x1, x2, x3;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3p2, p2p1, q3q2, q2q1;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i mask, hev;
+  __m128i d0, d1, d2, d3;
+  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
 
-  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
-
-  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
-
-  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
-
-  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
-
-  // Transpose 8x8
-  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
-  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
-  x0 = _mm_unpacklo_epi16(x2, x3);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
-  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
-
-  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
-  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
-  x2 = _mm_unpackhi_epi16(x2, x3);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
-
-  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
-  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
-  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-#if !CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK;
-#else   // CONFIG_PARALLEL_DEBLOCKING
-  FILTER_HEV_MASK4;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  FILTER4;
+  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
+
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
 
   // Transpose 8x4 to 4x8
-  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
-  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
-  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
-  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
-  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
-  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
-  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
-  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
-#endif
-  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
-
-  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
-#endif
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
 }
 
-static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num,
-                                       uint8_t *s) {
-#if CONFIG_PARALLEL_DEBLOCKING
-  *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x);
-  const __m128i hi = _mm_srli_si128(*x, 8);
-  *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi);
-#else
-  _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x);
-  _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x));
-#endif
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+  xx_storel_32(s - (num + 1) * p, x);
+  xx_storel_32(s + num * p, _mm_srli_si128(x, 8));
 }
 
-void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit,
-                                    const unsigned char *_limit,
-                                    const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
   __m128i mask, hev, flat, flat2;
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+  __m128i qs0ps0, qs1ps1;
+  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
   __m128i abs_p1p0;
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
-  q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+  p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
+  q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
 
   {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+    __m128i fe, ff, work;
+    abs_p1p0 = abs_diff(*q1p1, *q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
     fe = _mm_set1_epi8(0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = abs_diff(q0p0, p0q0);
-    abs_p1q1 = abs_diff(q1p1, p1q1);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, *thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
 
-    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
   }
 
-  // lp filter
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+  qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
+  qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+  // loopfilter done
+
+  __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+  {
+    __m128i work;
+    flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+
+    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+    work = abs_diff(*q6p6, *q0p0);
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // flat and wide flat calculations
   {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi16(0x1);
-    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
-    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
-    __m128i qs0 = _mm_xor_si128(p0q0, t80);
-    __m128i qs1 = _mm_xor_si128(p1q1, t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, qs0ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 0xB);
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 0xB);
-
-    // Filter1 >> 3
-    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
-    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
-
-    // filt >> 1
-    filt = _mm_adds_epi16(filter1, t1);
-    filt = _mm_srai_epi16(filt, 1);
-    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
-                            filt);
-    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
-    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
-    // loopfilter done
-
-    {
-      __m128i work;
-      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-      flat = _mm_max_epu8(abs_p1p0, flat);
-      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
-
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
-      q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
-
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
-      q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
-      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
-
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
-      q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
-      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i four = _mm_set1_epi16(4);
-      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-      __m128i pixelFilter_p, pixelFilter_q;
-      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
-
-      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
-      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
-      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
-      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
-      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
-      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
-      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
-      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
-      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
-      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
-      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
-      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
-      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
-      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
-      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
-      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
-
-      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
-                                    _mm_add_epi16(p4_16, p3_16));
-      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
-                                    _mm_add_epi16(q4_16, q3_16));
-
-      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-      pixelFilter_p =
-          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-      pixetFilter_p2p1p0 = _mm_add_epi16(
-          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
-      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(p7_16, p7_16);
-      sum_q7 = _mm_add_epi16(q7_16, q7_16);
-      sum_p3 = _mm_add_epi16(p3_16, p3_16);
-      sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
-      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
-      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
-      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    flat = _mm_shuffle_epi32(flat, 68);
-    flat2 = _mm_shuffle_epi32(flat2, 68);
-
-    q2p2 = _mm_andnot_si128(flat, q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    q6p6 = _mm_andnot_si128(flat2, q6p6);
-    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
-    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    store_buffer_horz_8(&q6p6, p, 6, s);
-
-    q5p5 = _mm_andnot_si128(flat2, q5p5);
-    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    store_buffer_horz_8(&q5p5, p, 5, s);
-
-    q4p4 = _mm_andnot_si128(flat2, q4p4);
-    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    store_buffer_horz_8(&q4p4, p, 4, s);
-
-    q3p3 = _mm_andnot_si128(flat2, q3p3);
-    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    store_buffer_horz_8(&q3p3, p, 3, s);
-
-    q2p2 = _mm_andnot_si128(flat2, q2p2);
-    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    store_buffer_horz_8(&q2p2, p, 2, s);
-
-    q1p1 = _mm_andnot_si128(flat2, q1p1);
-    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    store_buffer_horz_8(&q1p1, p, 1, s);
-
-    q0p0 = _mm_andnot_si128(flat2, q0p0);
-    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    store_buffer_horz_8(&q0p0, p, 0, s);
+    const __m128i eight = _mm_set1_epi16(8);
+    const __m128i four = _mm_set1_epi16(4);
+    __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+    __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+    __m128i pixelFilter_p, pixelFilter_q;
+    __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+    __m128i sum_p6, sum_q6;
+    __m128i sum_p3, sum_q3, res_p, res_q;
+
+    p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
+    p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+    p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+    p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+    p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+    q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+    q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+    q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+    q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+    q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+    q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+    q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+    pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+    pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+    pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+    pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+    pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+    pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+    pixelFilter_p =
+        _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+    pixetFilter_p2p1p0 = _mm_add_epi16(
+        four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixelFilter_p,
+                      _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+                                    _mm_add_epi16(p1_16, q0_16))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixelFilter_p,
+                      _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+                                    _mm_add_epi16(p0_16, q1_16))),
+        4);
+    flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+    flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(p6_16, p6_16);
+    sum_q6 = _mm_add_epi16(q6_16, q6_16);
+    sum_p3 = _mm_add_epi16(p3_16, p3_16);
+    sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+        4);
+    flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+    flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+        4);
+    flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+    flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+        4);
+    flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+        4);
+    flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+    sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+    sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+    pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+    res_p = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_p,
+            _mm_add_epi16(sum_p6,
+                          _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+        4);
+    res_q = _mm_srli_epi16(
+        _mm_add_epi16(
+            pixelFilter_q,
+            _mm_add_epi16(sum_q6,
+                          _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+        4);
+    flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
   }
-}
+  // wide flat
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-static INLINE __m128i filter_add2_sub2(const __m128i *const total,
-                                       const __m128i *const a1,
-                                       const __m128i *const a2,
-                                       const __m128i *const s1,
-                                       const __m128i *const s2) {
-  __m128i x = _mm_add_epi16(*a1, *total);
-  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
-  return x;
-}
+  flat = _mm_shuffle_epi32(flat, 68);
+  flat2 = _mm_shuffle_epi32(flat2, 68);
+
+  *q2p2 = _mm_andnot_si128(flat, *q2p2);
+  flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+  *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+  qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+  flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+  *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+  qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+  flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+  *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+  *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+  flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+  *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+  *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+  flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+  *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+  *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+  flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+  *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+  *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+  flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+  *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
 
-static INLINE __m128i filter8_mask(const __m128i *const flat,
-                                   const __m128i *const other_filt,
-                                   const __m128i *const f8_lo,
-                                   const __m128i *const f8_hi) {
-  const __m128i f8 =
-      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
-  const __m128i result = _mm_and_si128(*flat, f8);
-  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+  *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+  flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+  *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+  *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+  flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+  *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
 }
 
-static INLINE __m128i filter16_mask(const __m128i *const flat,
-                                    const __m128i *const other_filt,
-                                    const __m128i *const f_lo,
-                                    const __m128i *const f_hi) {
-  const __m128i f =
-      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
-  const __m128i result = _mm_and_si128(*flat, f);
-  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
+                                const unsigned char *_blimit,
+                                const unsigned char *_limit,
+                                const unsigned char *_thresh) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
+
+  q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
+
+  q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
+
+  q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+                            _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  store_buffer_horz_8(q0p0, p, 0, s);
+  store_buffer_horz_8(q1p1, p, 1, s);
+  store_buffer_horz_8(q2p2, p, 2, s);
+  store_buffer_horz_8(q3p3, p, 3, s);
+  store_buffer_horz_8(q4p4, p, 4, s);
+  store_buffer_horz_8(q5p5, p, 5, s);
 }
 
-typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, hev, flat;
+  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+  __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+  __m128i ps1ps0, qs1qs0;
 
-static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
-                                        int p, int offset, uint8_t *s) {
-  int i;
-  if (pixel_num == FOUR_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]);
-    }
-  }
-  if (pixel_num == EIGHT_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]);
-    }
-  }
-  if (pixel_num == SIXTEEN_PIXELS) {
-    for (i = 13; i >= 0; i--) {
-      _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]);
-    }
-  }
-}
+  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
 
-static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
-                                             unsigned char *s, int p,
-                                             const unsigned char *_blimit,
-                                             const unsigned char *_limit,
-                                             const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-  __m128i mask, hev, flat, flat2;
-  __m128i p7, p6, p5;
-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
-  __m128i q5, q6, q7;
-
-  __m128i op2, op1, op0, oq0, oq1, oq2;
-
-  __m128i max_abs_p1p0q1q0;
-
-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+  *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
+  const __m128i one = _mm_set1_epi8(1);
+  const __m128i fe = _mm_set1_epi8(0xfe);
+  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
   {
-    const __m128i abs_p1p0 = abs_diff(p1, p0);
-    const __m128i abs_q1q0 = abs_diff(q1, q0);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-    __m128i abs_p0q0 = abs_diff(p0, q0);
-    __m128i abs_p1q1 = abs_diff(p1, q1);
-    __m128i work;
-    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    // filter_mask and hev_mask
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = abs_diff(q1p1, q0p0);
+    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+    abs_p0q0 = abs_diff(*p1p0, *q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+    // considering sse doesn't have unsigned elements comparison the idea is
+    // to find at least one case when X > limit, it means the corresponding
+    // mask bit is set.
+    // to achieve that we find global max value of all inputs of abs(x-y) or
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
+    // otherwise - not
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, *thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+    mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+
+    work = abs_diff(q2p2, q1p1);
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
-  }
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
 
-  {
-    __m128i work;
-    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
-    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
-    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    // flat_mask
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
-    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi64(flat, flat);
   }
 
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // filter4
+  // 5 tap filter
   {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
-
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    op1 = _mm_xor_si128(p1, t80);
-    op0 = _mm_xor_si128(p0, t80);
-    oq0 = _mm_xor_si128(q0, t80);
-    oq1 = _mm_xor_si128(q1, t80);
-
-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
-
-    work_a = _mm_subs_epi8(oq0, op0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-    filt = _mm_andnot_si128(hev, filt);
-    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
-    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
-    // loopfilter done
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // filter8
-    {
-      const __m128i four = _mm_set1_epi16(4);
-      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
-      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
-      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
-      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
-      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
-      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
-      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
-      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-
-      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
-      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
-      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
-      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
-      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
-      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
-      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
-      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
-      __m128i f8_lo, f8_hi;
-
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
-                            _mm_add_epi16(p3_lo, p2_lo));
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
-                            _mm_add_epi16(p2_lo, p1_lo));
-      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
-
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
-                            _mm_add_epi16(p3_hi, p2_hi));
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
-                            _mm_add_epi16(p2_hi, p1_hi));
-      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
-
-      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
-      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
-      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
-      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
-      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
-
-      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
-      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
-      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
-    }
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // wide flat calculations
-    {
-      const __m128i eight = _mm_set1_epi16(8);
-      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
-      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
-      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
-      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
-      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
-      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
-      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
-      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
-      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
-      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
-      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
-      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
-      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
-      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
-      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
-      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
-
-      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
-      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
-      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
-      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
-      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
-      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
-      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
-      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
-      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
-      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
-      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
-      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
-      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
-      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
-      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
-      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
-
-      __m128i f_lo;
-      __m128i f_hi;
-
-      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
-      f_lo =
-          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
-      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
-                           _mm_add_epi16(p2_lo, p1_lo));
-      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
-      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
-
-      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
-      f_hi =
-          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
-      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
-                           _mm_add_epi16(p2_hi, p1_hi));
-      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
-      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
-
-      __m128i x[14];
-      x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
-      x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
-      x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
-      x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
-      x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
-      x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
-      x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
-      x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
-      x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
-      x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
-      x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
-      x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
-      x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-
-      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
-      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
-      x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-
-      store_buffer_horz_16(pixel_num, x, p, 6, s);
-    }
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    const __m128i four = _mm_set1_epi16(4);
+
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+    // op1
+    workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+                            _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+                            p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
+
+    workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+                                 3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+    // op0
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
+    workp_a = _mm_add_epi16(workp_a,
+                            workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+    workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+    // oq0
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+                            p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
+    workp_b = _mm_add_epi16(q1_16, q2_16);
+    workp_a = _mm_add_epi16(
+        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
+    workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+    // oq1
+    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+                            p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
+    workp_b = _mm_add_epi16(q2_16, q2_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+                                 3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
   }
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  *q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  *p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
 }
 
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+}
+
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit0,
+                                    const unsigned char *_limit0,
+                                    const unsigned char *_thresh0,
+                                    const unsigned char *_blimit1,
+                                    const unsigned char *_limit1,
+                                    const unsigned char *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2;
+  __m128i p1p0, q1q0;
+
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+    __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit,
+    __m128i *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   __m128i mask, hev, flat;
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
+  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+      flat_p1p0, flat_q0q1;
+  __m128i q2p2, q1p1, q0p0;
+  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+  __m128i work_a, op2, oq2;
 
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-  p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+  q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
 
   {
     // filter_mask and hev_mask
+
+    // considering sse doesn't have unsigned elements comparison the idea is to
+    // find at least one case when X > limit, it means the corresponding  mask
+    // bit is set.
+    // to achieve that we find global max value of all inputs of abs(x-y) or
+    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
+    // otherwise - not
+
     const __m128i one = _mm_set1_epi8(1);
     const __m128i fe = _mm_set1_epi8(0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
     abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
 
-    abs_p0q0 = abs_diff(q0p0, p0q0);
-    abs_p1q1 = abs_diff(q1p1, p1q1);
+    abs_p0q0 = abs_diff(p1p0, q1q0);
+    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
+
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, *thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the further "merged variables" usage
+    hev = _mm_unpacklo_epi64(hev, hev);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -1067,424 +1038,215 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
     // mask |= (abs(q1 - q0) > limit) * -1;
 
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, *limit);
     mask = _mm_cmpeq_epi8(mask, zero);
+    // replicate for the further "merged variables" usage
+    mask = _mm_unpacklo_epi64(mask, mask);
 
     // flat_mask4
 
     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
     flat = _mm_max_epu8(abs_p1p0, flat);
+
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
+    // replicate for the further "merged variables" usage
+    flat = _mm_unpacklo_epi64(flat, flat);
   }
 
+  // filter8
   {
     const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    {
-      __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[0],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-    }
-  }
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    filter1 = _mm_unpacklo_epi8(zero, filter1);
-    filter1 = _mm_srai_epi16(filter1, 11);
-    filter1 = _mm_packs_epi16(filter1, filter1);
-
-    // Filter2 >> 3
-    filter2 = _mm_unpacklo_epi8(zero, filter2);
-    filter2 = _mm_srai_epi16(filter2, 11);
-    filter2 = _mm_packs_epi16(filter2, zero);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    filt = _mm_unpacklo_epi8(zero, filt);
-    filt = _mm_srai_epi16(filt, 9);
-    filt = _mm_packs_epi16(filt, zero);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-#if CONFIG_PARALLEL_DEBLOCKING
-    *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2);
-    *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1);
-    *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0);
-    *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0);
-    *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1);
-    *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2);
-#else
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-#endif
+
+    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+    p2_16 = _mm_unpacklo_epi8(*p2, zero);
+    p1_16 = _mm_unpacklo_epi8(*p1, zero);
+    p0_16 = _mm_unpacklo_epi8(*p0, zero);
+    q0_16 = _mm_unpacklo_epi8(*q0, zero);
+    q1_16 = _mm_unpacklo_epi8(*q1, zero);
+    q2_16 = _mm_unpacklo_epi8(*q2, zero);
+    p3_16 = _mm_unpacklo_epi8(*p3, zero);
+    q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+    // op2
+    workp_a =
+        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    op2 = _mm_packus_epi16(workp_shft0, workp_shft0);
+
+    // op1
+    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // op0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+    // oq0
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    // oq1
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+    // oq2
+    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+    oq2 = _mm_packus_epi16(workp_shft1, workp_shft1);
   }
+
+  // lp filter - the same for 6, 8 and 14 versions
+  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+
+  qs1qs0 = _mm_andnot_si128(flat, qs1qs0);
+  q1q0 = _mm_and_si128(flat, flat_q0q1);
+  *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+  ps1ps0 = _mm_andnot_si128(flat, ps1ps0);
+  p1p0 = _mm_and_si128(flat, flat_p1p0);
+  *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+  work_a = _mm_andnot_si128(flat, *q2);
+  q2_16 = _mm_and_si128(flat, oq2);
+  *q2_out = _mm_or_si128(work_a, q2_16);
+
+  work_a = _mm_andnot_si128(flat, *p2);
+  p2_16 = _mm_and_si128(flat, op2);
+  *p2_out = _mm_or_si128(work_a, p2_16);
 }
 
-void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
-#if CONFIG_PARALLEL_DEBLOCKING
-  lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
-#else
-  lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh);
-#endif
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+                               const unsigned char *_blimit,
+                               const unsigned char *_limit,
+                               const unsigned char *_thresh) {
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0, p2_out, q2_out;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+  q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
+
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+
+  xx_storel_32(s - 1 * p, p1p0);
+  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8));
+  xx_storel_32(s + 0 * p, q1q0);
+  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8));
+  xx_storel_32(s - 3 * p, p2_out);
+  xx_storel_32(s + 2 * p, q2_out);
 }
 
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit0,
+                                     const unsigned char *_limit0,
+                                     const unsigned char *_thresh0,
+                                     const unsigned char *_blimit1,
+                                     const unsigned char *_limit1,
+                                     const unsigned char *_thresh1) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
                          _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-  const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
                          _mm_load_si128((const __m128i *)_thresh1));
 
-  __m128i mask, hev, flat;
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-    __m128i work;
+  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
 
-    // filter_mask and hev_mask
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
 
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
+  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+
+  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+  _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
+  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+  _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
+  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+  _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
+  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+  _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
+}
 
-    // flat_mask4
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
-        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
-        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-  }
-  {
-    const __m128i four = _mm_set1_epi16(4);
-    unsigned char *src = s;
-    int i = 0;
-
-    do {
-      __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
-
-      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
-                       _mm_packus_epi16(workp_shft, workp_shft));
-
-      src += 8;
-    } while (++i < 2);
-  }
-  // lp filter
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q0 = _mm_load_si128((__m128i *)flat_oq0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-
-    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    q1 = _mm_load_si128((__m128i *)flat_oq1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
-    q2 = _mm_load_si128((__m128i *)flat_oq2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p0 = _mm_load_si128((__m128i *)flat_op0);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-
-    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    p1 = _mm_load_si128((__m128i *)flat_op1);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
-    p2 = _mm_load_si128((__m128i *)flat_op2);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-  }
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                    const uint8_t *_limit0,
+                                    const uint8_t *_thresh0,
+                                    const uint8_t *_blimit1,
+                                    const uint8_t *_limit1,
+                                    const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+  __m128i q1q0, p1p0, p2_out, q2_out;
+
+  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+                      &p2_out, &q2_out, &blimit, &limit, &thresh);
+
+  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+  _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out);
 }
 
 void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
@@ -1494,449 +1256,405 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit1,
                                     const unsigned char *_limit1,
                                     const unsigned char *_thresh1) {
+  __m128i p1, p0, q0, q1;
+  __m128i qs1qs0, ps1ps0;
+
+  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+  const __m128i zero = _mm_setzero_si128();
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
                          _mm_load_si128((const __m128i *)_blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
                          _mm_load_si128((const __m128i *)_limit1));
-  const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-  const __m128i zero = _mm_set1_epi16(0);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p3, p2, q2, q3;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  __m128i p1, p0, q0, q1;
-  __m128i mask, hev, flat;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-#if !CONFIG_PARALLEL_DEBLOCKING
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // filter_mask and hev_mask
-  {
-    const __m128i abs_p1p0 =
-        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 =
-        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 =
-        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 =
-        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
-#if !CONFIG_PARALLEL_DEBLOCKING
-    __m128i work;
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
-#if !CONFIG_PARALLEL_DEBLOCKING
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
-        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
-    mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(
-        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
-        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-    mask = _mm_subs_epu8(mask, limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
 
-  // filter4
-  {
-    const __m128i t4 = _mm_set1_epi8(4);
-    const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
-    const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
-    const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-    const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-    const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-    const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
-    __m128i filt;
-    __m128i work_a;
-    __m128i filter1, filter2;
-
-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    filt = _mm_adds_epi8(filt, work_a);
-    // (aom_filter + 3 * (qs0 - ps0)) & mask
-    filt = _mm_and_si128(filt, mask);
-
-    filter1 = _mm_adds_epi8(filt, t4);
-    filter2 = _mm_adds_epi8(filt, t3);
-
-    // Filter1 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
-
-    // Filter2 >> 3
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
-
-    // filt >> 1
-    filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
-
-    filt = _mm_andnot_si128(hev, filt);
-
-    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
-    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
-    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
-    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-  }
-}
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
 
-static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
-                                 int in_p, unsigned char *out, int out_p) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
-
-  // 2-way interleave w/hoisting of unpacks
-  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
-  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
-  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1
-
-  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
-  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
-  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2
-
-  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
-  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
-  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3
-
-  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
-  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
-  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
-  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9
-
-  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
-  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
-  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
-  x5 = _mm_unpacklo_epi16(x2, x3);                // 10
-
-  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
-  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
-  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6
-
-  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
-  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
-  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
-  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11
-
-  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
-  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
-  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
-  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12
-
-  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
-  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
-  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
-  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
+  __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
 
-  // Store first 4-line result
-  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
-  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+  __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
 
-  x4 = _mm_unpackhi_epi16(x0, x1);
-  x5 = _mm_unpackhi_epi16(x2, x3);
-  x12 = _mm_unpackhi_epi16(x8, x9);
-  x13 = _mm_unpackhi_epi16(x10, x11);
+  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  x6 = _mm_unpacklo_epi32(x4, x5);
-  x7 = _mm_unpackhi_epi32(x4, x5);
-  x14 = _mm_unpacklo_epi32(x12, x13);
-  x15 = _mm_unpackhi_epi32(x12, x13);
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
-  // Store second 4-line result
-  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
-  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
-  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
 }
 
-#if CONFIG_PARALLEL_DEBLOCKING
-#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
-#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
-#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
-#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
-#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
-#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
-enum { ROTATE_DWORD_RIGHT = 0x39 };
-static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
-                                 const uint8_t *pSrc,
-                                 const ptrdiff_t srcStride) {
-  for (uint32_t idx = 0; idx < 2; idx += 1) {
-    __m128i r0, r1, r2, r3;
-    // load data
-    r0 = movq(pSrc);
-    r1 = movq(pSrc + srcStride);
-    r2 = movq(pSrc + srcStride * 2);
-    r3 = movq(pSrc + srcStride * 3);
-    // transpose
-    r0 = punpcklbw(r0, r1);
-    r2 = punpcklbw(r2, r3);
-    r1 = punpckhwd(r0, r2);
-    r0 = punpcklwd(r0, r2);
-    // store data
-    movd(pDst, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 2, r0);
-    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 3, r0);
-    movd(pDst + dstStride * 4, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 5, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 6, r1);
-    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
-    movd(pDst + dstStride * 7, r1);
-    // advance the pointers
-    pDst += dstStride * 8;
-    pSrc += 8;
-  }
-}
-
-#endif  // CONFIG_PARALLEL_DEBLOCKING
-static INLINE void transpose(unsigned char *src[], int in_p,
-                             unsigned char *dst[], int out_p,
-                             int num_8x8_to_transpose) {
-  int idx8x8 = 0;
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i p0, q0, q1, p1;
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    unsigned char *in = src[idx8x8];
-    unsigned char *out = dst[idx8x8];
-
-    x0 =
-        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    x1 =
-        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-    x0 = _mm_unpacklo_epi8(x0, x1);
-
-    x2 =
-        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    x3 =
-        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-    x1 = _mm_unpacklo_epi8(x2, x3);
-
-    x4 =
-        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    x5 =
-        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-    x2 = _mm_unpacklo_epi8(x4, x5);
-
-    x6 =
-        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    x7 =
-        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-    x3 = _mm_unpacklo_epi8(x6, x7);
-
-    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-    x4 = _mm_unpacklo_epi16(x0, x1);
-    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-    x5 = _mm_unpacklo_epi16(x2, x3);
-    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 0 * out_p),
-                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
-    _mm_storeh_pd((double *)(out + 1 * out_p),
-                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 2 * out_p),
-                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
-    _mm_storeh_pd((double *)(out + 3 * out_p),
-                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
-
-    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi16(x0, x1);
-    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi16(x2, x3);
-    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-    x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 4 * out_p),
-                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
-    _mm_storeh_pd((double *)(out + 5 * out_p),
-                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
-    _mm_storel_pd((double *)(out + 6 * out_p),
-                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
-    _mm_storeh_pd((double *)(out + 7 * out_p),
-                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
-}
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i qs1qs0, ps1ps0;
 
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  unsigned char *src[2];
-  unsigned char *dst[2];
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-  // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
 
-  // Loop filtering
-  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
-#if !CONFIG_PARALLEL_DEBLOCKING
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
-
-  // Transpose back
-  transpose(src, 16, dst, p, 2);
-#else  // CONFIG_PARALLEL_DEBLOCKING
-  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
-#endif  // !CONFIG_PARALLEL_DEBLOCKING
-}
+  __m128i l = _mm_unpacklo_epi64(blimit, limit);
 
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
-                             const unsigned char *blimit,
-                             const unsigned char *limit,
-                             const unsigned char *thresh) {
-  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
-  unsigned char *src[1];
-  unsigned char *dst[1];
+  __m128i thresh0 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
 
-  // Transpose 8x8
-  src[0] = s - 4;
-  dst[0] = t_dst;
+  __m128i thresh1 =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
 
-  transpose(src, p, dst, 8, 1);
+  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
 
-  // Loop filtering
-  aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
+  x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
 
-  src[0] = t_dst;
-  dst[0] = s - 4;
+  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+                        &q1);
 
-  // Transpose back
-  transpose(src, 8, dst, p, 1);
-}
+  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
 
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
-  unsigned char *src[2];
-  unsigned char *dst[2];
+  p1 = _mm_srli_si128(ps1ps0, 8);
+  q1 = _mm_srli_si128(qs1qs0, 8);
 
-  // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+                        &d5, &d6, &d7);
 
-  // Loop filtering
-  aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
+}
 
-  dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x2, x1, x0, x3;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+
+  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+  xx_storel_32(s + 0 * p - 2, d0);
+  xx_storel_32(s + 1 * p - 2, d1);
+  xx_storel_32(s + 2 * p - 2, d2);
+  xx_storel_32(s + 3 * p - 2, d3);
+}
 
-  // Transpose back
-  transpose(src, 16, dst, p, 2);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
+
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i p0, q0;
+  __m128i p1p0, q1q0;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+
+  lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit,
+                      &limit, &thresh);
+
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
+
+  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+                        &d6, &d7);
+
+  xx_storel_32((s - 2 + 0 * p), d0);
+  xx_storel_32((s - 2 + 1 * p), d1);
+  xx_storel_32((s - 2 + 2 * p), d2);
+  xx_storel_32((s - 2 + 3 * p), d3);
+  xx_storel_32((s - 2 + 4 * p), d4);
+  xx_storel_32((s - 2 + 5 * p), d5);
+  xx_storel_32((s - 2 + 6 * p), d6);
+  xx_storel_32((s - 2 + 7 * p), d7);
 }
 
-void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
-                              const unsigned char *blimit,
-                              const unsigned char *limit,
-                              const unsigned char *thresh) {
-  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
-  unsigned char *src[2];
-  unsigned char *dst[2];
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+                             const unsigned char *_blimit,
+                             const unsigned char *_limit,
+                             const unsigned char *_thresh) {
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+  __m128i p2, p0, q0, q2;
+  __m128i x2, x1, x0, x3;
+  __m128i q1q0, p1p0;
+  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+                        &d7);
+  // Loop filtering
+  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2,
+                      &q2, &blimit, &limit, &thresh);
 
-  src[0] = s - 8;
-  src[1] = s;
-  dst[0] = t_dst;
-  dst[1] = t_dst + 8 * 8;
+  p0 = _mm_srli_si128(p1p0, 8);
+  q0 = _mm_srli_si128(q1q0, 8);
 
-  // Transpose 16x8
-  transpose(src, p, dst, 8, 2);
+  transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1,
+                        &d2, &d3);
 
-  // Loop filtering
-  aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
 
-  src[0] = t_dst;
-  src[1] = t_dst + 8 * 8;
-  dst[0] = s - 8;
-  dst[1] = s;
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+                                  const uint8_t *_limit0,
+                                  const uint8_t *_thresh0,
+                                  const uint8_t *_blimit1,
+                                  const uint8_t *_limit1,
+                                  const uint8_t *_thresh1) {
+  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+                                      _mm_load_si128((__m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+                                     _mm_load_si128((__m128i *)_limit1));
+  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+                                      _mm_load_si128((__m128i *)_thresh1));
 
-  // Transpose back
-  transpose(src, 8, dst, p, 2);
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i d1, d3, d5, d7;
+  __m128i q1q0, p1p0;
+  __m128i p2, p1, q1, q2;
+  __m128i d0d1, d2d3, d4d5, d6d7;
+
+  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+
+  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+                    &d6d7);
+
+  d1 = _mm_srli_si128(d0d1, 8);
+  d3 = _mm_srli_si128(d2d3, 8);
+  d5 = _mm_srli_si128(d4d5, 8);
+  d7 = _mm_srli_si128(d6d7, 8);
+
+  lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0,
+                      &p1p0, &p2, &q2, &blimit, &limit, &thresh);
+
+  p1 = _mm_srli_si128(p1p0, 8);
+  q1 = _mm_srli_si128(q1q0, 8);
+
+  transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3,
+                    &d4d5, &d6d7);
+
+  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
 }
 
-void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
-
-  // Transpose 16x16
-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+                              const unsigned char *_blimit,
+                              const unsigned char *_limit,
+                              const unsigned char *_thresh) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x6, x5, x4, x3, x2, x1, x0;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+  __m128i q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i p0_out, p1_out, p2_out, p3_out;
+  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+  __m128i limit = _mm_load_si128((__m128i *)_limit);
+  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+  x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p));
+  x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p));
+  x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p));
+
+  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6,
+                        &p7);
+
+  x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+  x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+  x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+  x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6,
+                        &q7);
+
+  q6p6 = _mm_unpacklo_epi64(p1, q6);
+  q5p5 = _mm_unpacklo_epi64(p2, q5);
+  q4p4 = _mm_unpacklo_epi64(p3, q4);
+  q3p3 = _mm_unpacklo_epi64(p4, q3);
+  q2p2 = _mm_unpacklo_epi64(p5, q2);
+  q1p1 = _mm_unpacklo_epi64(p6, q1);
+  q0p0 = _mm_unpacklo_epi64(p7, q0);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+                        &p0_out, &p1_out, &p2_out, &p3_out);
+
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+
+  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2,
+                        &q3);
+
+  _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out);
+  _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out);
+
+  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+  _mm_storel_epi64((__m128i *)(s + 3 * p), q3);
+}
 
-  // Loop filtering
-  aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+void aom_lpf_vertical_14_dual_sse2(
+    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+    const uint8_t *_thresh1) {
+  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+  __m128i x7, x6, x5, x4, x3, x2, x1, x0;
+  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+  __m128i q0, q1, q2, q3, q7;
+  __m128i p0p1, p2p3, p4p5, p6p7;
+
+  __m128i blimit =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+                                     _mm_load_si128((const __m128i *)_limit1));
+  __m128i thresh =
+      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
 
-  // Transpose back
-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
+  x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+
+  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+  q7 = _mm_srli_si128(d14d15, 8);
+
+  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+                       &limit, &thresh);
+
+  x0 = _mm_srli_si128(q0p0, 8);
+  x1 = _mm_srli_si128(q1p1, 8);
+  x2 = _mm_srli_si128(q2p2, 8);
+  x3 = _mm_srli_si128(q3p3, 8);
+  x4 = _mm_srli_si128(q4p4, 8);
+  x5 = _mm_srli_si128(q5p5, 8);
+  x6 = _mm_srli_si128(q6p6, 8);
+
+  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+
+  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
 }
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
index 027c890dc..c6b6469b4 100644
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -14,117 +14,202 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-
-static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
-                                    int out_p, int num_8x8_to_transpose) {
-  int idx8x8 = 0;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
-  do {
-    uint16_t *in = src[idx8x8];
-    uint16_t *out = dst[idx8x8];
-
-    p0 =
-        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
-    p1 =
-        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
-    p2 =
-        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
-    p3 =
-        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
-    p4 =
-        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
-    p5 =
-        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
-    p6 =
-        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
-    p7 =
-        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
-    // 00 10 01 11 02 12 03 13
-    x0 = _mm_unpacklo_epi16(p0, p1);
-    // 20 30 21 31 22 32 23 33
-    x1 = _mm_unpacklo_epi16(p2, p3);
-    // 40 50 41 51 42 52 43 53
-    x2 = _mm_unpacklo_epi16(p4, p5);
-    // 60 70 61 71 62 72 63 73
-    x3 = _mm_unpacklo_epi16(p6, p7);
-    // 00 10 20 30 01 11 21 31
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 40 50 60 70 41 51 61 71
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 00 10 20 30 40 50 60 70
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 01 11 21 31 41 51 61 71
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
-    // 00 10 20 30 40 50 60 70
-    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
-    // 01 11 21 31 41 51 61 71
-
-    // 02 12 22 32 03 13 23 33
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 42 52 62 72 43 53 63 73
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 02 12 22 32 42 52 62 72
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
-    // 02 12 22 32 42 52 62 72
-    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
-    // 03 13 23 33 43 53 63 73
-
-    // 04 14 05 15 06 16 07 17
-    x0 = _mm_unpackhi_epi16(p0, p1);
-    // 24 34 25 35 26 36 27 37
-    x1 = _mm_unpackhi_epi16(p2, p3);
-    // 44 54 45 55 46 56 47 57
-    x2 = _mm_unpackhi_epi16(p4, p5);
-    // 64 74 65 75 66 76 67 77
-    x3 = _mm_unpackhi_epi16(p6, p7);
-    // 04 14 24 34 05 15 25 35
-    x4 = _mm_unpacklo_epi32(x0, x1);
-    // 44 54 64 74 45 55 65 75
-    x5 = _mm_unpacklo_epi32(x2, x3);
-    // 04 14 24 34 44 54 64 74
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 05 15 25 35 45 55 65 75
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
-    // 04 14 24 34 44 54 64 74
-    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
-    // 05 15 25 35 45 55 65 75
-
-    // 06 16 26 36 07 17 27 37
-    x4 = _mm_unpackhi_epi32(x0, x1);
-    // 46 56 66 76 47 57 67 77
-    x5 = _mm_unpackhi_epi32(x2, x3);
-    // 06 16 26 36 46 56 66 76
-    x6 = _mm_unpacklo_epi64(x4, x5);
-    // 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi64(x4, x5);
-
-    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
-    // 06 16 26 36 46 56 66 76
-    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
-    // 07 17 27 37 47 57 67 77
-  } while (++idx8x8 < num_8x8_to_transpose);
+#include "config/aom_config.h"
+
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+                                            __m128i *x2, __m128i *x3,
+                                            __m128i *x4, __m128i *x5,
+                                            __m128i *d0, __m128i *d1,
+                                            __m128i *d2, __m128i *d3,
+                                            __m128i *d4, __m128i *d5) {
+  __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+  // 00 01 02 03 04 05 xx xx
+  // 10 11 12 13 14 15 xx xx
+  // 20 21 22 23 24 25 xx xx
+  // 30 31 32 33 34 35 xx xx
+  // 40 41 42 43 44 45 xx xx
+  // 50 51 52 53 54 55 xx xx
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
+  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
+  *d1 = _mm_unpackhi_epi64(ww0,
+                           _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  *d2 = _mm_unpacklo_epi64(ww0,
+                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
+
+  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
+  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
+  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
+
+  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
+
+  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
+  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
+  *d5 = _mm_unpackhi_epi64(ww0,
+                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+                                                    __m128i *x2, __m128i *x3,
+                                                    __m128i *d0, __m128i *d1,
+                                                    __m128i *d2, __m128i *d3) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i w0, w1, ww0, ww1;
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+
+  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
+  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
+  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
+  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+                                                     __m128i *x2, __m128i *x3,
+                                                     __m128i *d4, __m128i *d5,
+                                                     __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, ww2, ww3;
+  __m128i zero = _mm_setzero_si128();
+
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+
+  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
+  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
+
+  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
+  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
+  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
+  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
 }
 
-static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
-                                        uint16_t *out, int out_p) {
-  uint16_t *src0[1];
-  uint16_t *src1[1];
-  uint16_t *dest0[1];
-  uint16_t *dest1[1];
-  src0[0] = in0;
-  src1[0] = in1;
-  dest0[0] = out;
-  dest1[0] = out + 8;
-  highbd_transpose(src0, in_p, dest0, out_p, 1);
-  highbd_transpose(src1, in_p, dest1, out_p, 1);
+// here in and out pointers (x and d) should be different! we don't store their
+// values inside
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3,
+                                                __m128i *d4, __m128i *d5,
+                                                __m128i *d6, __m128i *d7) {
+  // input
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // output
+  // 00 10 20 30 xx xx xx xx
+  // 01 11 21 31 xx xx xx xx
+  // 02 12 22 32 xx xx xx xx
+  // 03 13 23 33 xx xx xx xx
+  // 04 14 24 34 xx xx xx xx
+  // 05 15 25 35 xx xx xx xx
+  // 06 16 26 36 xx xx xx xx
+  // 07 17 27 37 xx xx xx xx
+  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
 }
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+                                                __m128i *x2, __m128i *x3,
+                                                __m128i *x4, __m128i *x5,
+                                                __m128i *x6, __m128i *x7,
+                                                __m128i *d0, __m128i *d1,
+                                                __m128i *d2, __m128i *d3) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+
+  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
+  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
+  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
+  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
+
+  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
+  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
+
+  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
+  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+                                                 __m128i *x2, __m128i *x3,
+                                                 __m128i *x4, __m128i *x5,
+                                                 __m128i *x6, __m128i *x7,
+                                                 __m128i *d4, __m128i *d5,
+                                                 __m128i *d6, __m128i *d7) {
+  __m128i w0, w1, w2, w3, ww0, ww1;
+  // x0 00 01 02 03 04 05 06 07
+  // x1 10 11 12 13 14 15 16 17
+  // x2 20 21 22 23 24 25 26 27
+  // x3 30 31 32 33 34 35 36 37
+  // x4 40 41 42 43 44 45 46 47
+  // x5 50 51 52 53 54 55 56 57
+  // x6 60 61 62 63 64 65 66 67
+  // x7 70 71 72 73 74 75 76 77
+  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
+  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
+  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
+  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
+
+  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
+  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
+
+  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
+  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
+
+  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
+  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
+
+  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
+  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
+}
+
+// here in and out pointers (x and d) should be different! we don't store their
+// values inside
+static INLINE void highbd_transpose8x8_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// here in and out pointers (x and d arrays) should be different! we don't store
+// their values inside
+static INLINE void highbd_transpose8x16_sse2(
+    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+    __m128i *d7) {
+  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+                           d5, d6, d7);
+  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+}
+
 #endif  // _AOM_DSP_X86_LPF_COMMON_X86_H
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
index 2536f91d2..1f42eec2f 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -12,8 +12,9 @@
 #include <stdio.h>
 #include <tmmintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_dsp/blend.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
@@ -75,11 +76,9 @@ static INLINE unsigned int masked_sad4xh_ssse3(
                                  ref_stride, msk, msk_stride, n);             \
   }
 
-#if CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(128, 128)
 MASKSADMXN_SSSE3(128, 64)
 MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 MASKSADMXN_SSSE3(64, 64)
 MASKSADMXN_SSSE3(64, 32)
 MASKSADMXN_SSSE3(32, 64)
@@ -93,18 +92,12 @@ MASKSAD8XN_SSSE3(8)
 MASKSAD8XN_SSSE3(4)
 MASKSAD4XN_SSSE3(8)
 MASKSAD4XN_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 MASKSAD4XN_SSSE3(16)
 MASKSADMXN_SSSE3(16, 4)
 MASKSAD8XN_SSSE3(32)
 MASKSADMXN_SSSE3(32, 8)
 MASKSADMXN_SSSE3(16, 64)
 MASKSADMXN_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-MASKSADMXN_SSSE3(32, 128)
-MASKSADMXN_SSSE3(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                             int src_stride,
@@ -239,7 +232,6 @@ static INLINE unsigned int masked_sad4xh_ssse3(
   return (sad + 31) >> 6;
 }
 
-#if CONFIG_HIGHBITDEPTH
 // For width a multiple of 8
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
@@ -277,11 +269,9 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
                                         ref8, ref_stride, msk, msk_stride, n); \
   }
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(128, 128)
 HIGHBD_MASKSADMXN_SSSE3(128, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HIGHBD_MASKSADMXN_SSSE3(64, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 64)
@@ -295,18 +285,12 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8)
 HIGHBD_MASKSADMXN_SSSE3(8, 4)
 HIGHBD_MASKSAD4XN_SSSE3(8)
 HIGHBD_MASKSAD4XN_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASKSAD4XN_SSSE3(16)
 HIGHBD_MASKSADMXN_SSSE3(16, 4)
 HIGHBD_MASKSADMXN_SSSE3(8, 32)
 HIGHBD_MASKSADMXN_SSSE3(32, 8)
 HIGHBD_MASKSADMXN_SSSE3(16, 64)
 HIGHBD_MASKSADMXN_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASKSADMXN_SSSE3(32, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE unsigned int highbd_masked_sad_ssse3(
     const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
@@ -424,5 +408,3 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(
   int sad = _mm_cvtsi128_si32(res);
   return (sad + 31) >> 6;
 }
-
-#endif
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
index 3ffe132be..d7dbefd7d 100644
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -13,13 +13,15 @@
 #include <string.h>
 #include <tmmintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/blend.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
 #include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
 
 // For width a multiple of 16
 static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
@@ -108,11 +110,9 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
     return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
   }
 
-#if CONFIG_EXT_PARTITION
 MASK_SUBPIX_VAR_SSSE3(128, 128)
 MASK_SUBPIX_VAR_SSSE3(128, 64)
 MASK_SUBPIX_VAR_SSSE3(64, 128)
-#endif
 MASK_SUBPIX_VAR_SSSE3(64, 64)
 MASK_SUBPIX_VAR_SSSE3(64, 32)
 MASK_SUBPIX_VAR_SSSE3(32, 64)
@@ -126,18 +126,12 @@ MASK_SUBPIX_VAR8XH_SSSE3(8)
 MASK_SUBPIX_VAR8XH_SSSE3(4)
 MASK_SUBPIX_VAR4XH_SSSE3(8)
 MASK_SUBPIX_VAR4XH_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 MASK_SUBPIX_VAR4XH_SSSE3(16)
 MASK_SUBPIX_VAR_SSSE3(16, 4)
 MASK_SUBPIX_VAR8XH_SSSE3(32)
 MASK_SUBPIX_VAR_SSSE3(32, 8)
 MASK_SUBPIX_VAR_SSSE3(64, 16)
 MASK_SUBPIX_VAR_SSSE3(16, 64)
-#if CONFIG_EXT_PARTITION
-MASK_SUBPIX_VAR_SSSE3(128, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 128)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static INLINE __m128i filter_block(const __m128i a, const __m128i b,
                                    const __m128i filter) {
@@ -523,7 +517,6 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
-#if CONFIG_HIGHBITDEPTH
 // For width a multiple of 8
 static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                    int xoffset, int yoffset, uint16_t *dst,
@@ -695,11 +688,9 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                  \
   }
 
-#if CONFIG_EXT_PARTITION
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
-#endif
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
@@ -713,18 +704,12 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
-#if CONFIG_EXT_PARTITION_TYPES
 HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
 HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
-#if CONFIG_EXT_PARTITION
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32)
-#endif
-#endif
 
 static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
                                           const __m128i filter) {
@@ -1040,4 +1025,40 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
 
-#endif
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                              int width, int height, const uint8_t *ref,
+                              int ref_stride, const uint8_t *mask,
+                              int mask_stride, int invert_mask) {
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  assert(height % 2 == 0);
+  int i = 0;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+                              mask + mask_stride, comp_pred + width);
+      comp_pred += (width << 1);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  } else {  // width == 32
+    assert(width == 32);
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
+      comp_pred += (width);
+      src0 += (stride0);
+      src1 += (stride1);
+      mask += (mask_stride);
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 000000000..dc41a8342
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+                                           const uint8_t *src1,
+                                           const uint8_t *mask, uint8_t *dst) {
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          const uint8_t *mask,
+                                          int mask_stride) {
+  int i = 0;
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  do {
+    // odd line A
+    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+    // even line B
+    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+    const __m128i ma = _mm_sub_epi8(alpha_max, a);
+    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+    const __m128i round = _mm_packus_epi16(roundA, roundB);
+    // comp_pred's stride == width == 8
+    _mm_store_si128((__m128i *)(comp_pred), round);
+    comp_pred += (8 << 1);
+    src0 += (stride0 << 1);
+    src1 += (stride1 << 1);
+    mask += (mask_stride << 1);
+    i += 2;
+  } while (i < height);
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..8b69606dd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+  return _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  __m128i dst;
+  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+  return dst;
+}
+
+#endif  // AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
index 73589a32a..a3535f985 100644
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -14,7 +14,7 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
 
 static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
   v_d = _mm_hadd_epi32(v_d, v_d);
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
index 52dd508ec..0338a8c77 100644
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
 
@@ -24,9 +25,11 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
-                                       const int32_t *wsrc, const int32_t *mask,
-                                       const int height) {
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                                 const int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask,
+                                                 const int height) {
   const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
@@ -59,11 +62,9 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
-                                        const int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, const int width,
-                                        const int height) {
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
   const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
@@ -119,11 +120,9 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
     }                                                          \
   }
 
-#if CONFIG_EXT_PARTITION
 OBMCSADWXH(128, 128)
 OBMCSADWXH(128, 64)
 OBMCSADWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 OBMCSADWXH(64, 64)
 OBMCSADWXH(64, 32)
 OBMCSADWXH(32, 64)
@@ -137,25 +136,22 @@ OBMCSADWXH(8, 8)
 OBMCSADWXH(8, 4)
 OBMCSADWXH(4, 8)
 OBMCSADWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 OBMCSADWXH(4, 16)
 OBMCSADWXH(16, 4)
 OBMCSADWXH(8, 32)
 OBMCSADWXH(32, 8)
 OBMCSADWXH(16, 64)
 OBMCSADWXH(64, 16)
-#endif
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
-static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
-                                           const int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask,
-                                           const int height) {
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                                     const int pre_stride,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *mask,
+                                                     const int height) {
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   const int pre_step = pre_stride - 4;
   int n = 0;
@@ -189,11 +185,9 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
-                                            const int pre_stride,
-                                            const int32_t *wsrc,
-                                            const int32_t *mask,
-                                            const int width, const int height) {
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+    const int32_t *mask, const int width, const int height) {
   const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   const int pre_step = pre_stride - width;
   int n = 0;
@@ -250,11 +244,9 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
     }                                                             \
   }
 
-#if CONFIG_EXT_PARTITION
 HBD_OBMCSADWXH(128, 128)
 HBD_OBMCSADWXH(128, 64)
 HBD_OBMCSADWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HBD_OBMCSADWXH(64, 64)
 HBD_OBMCSADWXH(64, 32)
 HBD_OBMCSADWXH(32, 64)
@@ -268,12 +260,9 @@ HBD_OBMCSADWXH(8, 8)
 HBD_OBMCSADWXH(8, 4)
 HBD_OBMCSADWXH(4, 8)
 HBD_OBMCSADWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 HBD_OBMCSADWXH(4, 16)
 HBD_OBMCSADWXH(16, 4)
 HBD_OBMCSADWXH(8, 32)
 HBD_OBMCSADWXH(32, 8)
 HBD_OBMCSADWXH(16, 64)
 HBD_OBMCSADWXH(64, 16)
-#endif
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
index 392616af3..571aa770b 100644
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -12,7 +12,8 @@
 #include <assert.h>
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom_ports/mem.h"
 #include "aom/aom_integer.h"
 
@@ -128,11 +129,9 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
     return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
   }
 
-#if CONFIG_EXT_PARTITION
 OBMCVARWXH(128, 128)
 OBMCVARWXH(128, 64)
 OBMCVARWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 OBMCVARWXH(64, 64)
 OBMCVARWXH(64, 32)
 OBMCVARWXH(32, 64)
@@ -146,24 +145,17 @@ OBMCVARWXH(8, 8)
 OBMCVARWXH(8, 4)
 OBMCVARWXH(4, 8)
 OBMCVARWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 OBMCVARWXH(4, 16)
 OBMCVARWXH(16, 4)
 OBMCVARWXH(8, 32)
 OBMCVARWXH(32, 8)
 OBMCVARWXH(16, 64)
 OBMCVARWXH(64, 16)
-#if CONFIG_EXT_PARTITION
-OBMCVARWXH(32, 128)
-OBMCVARWXH(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
 
 ////////////////////////////////////////////////////////////////////////////////
 // High bit-depth
 ////////////////////////////////////////////////////////////////////////////////
 
-#if CONFIG_HIGHBITDEPTH
 static INLINE void hbd_obmc_variance_w4(
     const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
     const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
@@ -278,8 +270,19 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
   uint64_t sse64 = 0;
   if (w == 4) {
     hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
+  } else if (w < 128 || h < 128) {
     hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  } else {
+    assert(w == 128 && h == 128);
+
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+                            64);
+      pre8 += 64 * pre_stride;
+      wsrc += 64 * w;
+      mask += 64 * w;
+      h -= 64;
+    } while (h > 0);
   }
   *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
@@ -291,28 +294,23 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                            unsigned int *sse, int *sum) {
   int64_t sum64 = 0;
   uint64_t sse64 = 0;
-  if (w == 128) {
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
-                            32);
-      pre8 += 32 * pre_stride;
-      wsrc += 32 * 128;
-      mask += 32 * 128;
-      h -= 32;
-    } while (h > 0);
-  } else if (w == 64 && h >= 128) {
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
-                            64);
-      pre8 += 64 * pre_stride;
-      wsrc += 64 * 64;
-      mask += 64 * 64;
-      h -= 64;
-    } while (h > 0);
-  } else if (w == 4) {
+  int max_pel_allowed_per_ovf = 512;
+  if (w == 4) {
     hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
+  } else if (w * h <= max_pel_allowed_per_ovf) {
     hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  } else {
+    int h_per_ovf = max_pel_allowed_per_ovf / w;
+
+    assert(max_pel_allowed_per_ovf % w == 0);
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+                            h_per_ovf);
+      pre8 += h_per_ovf * pre_stride;
+      wsrc += h_per_ovf * w;
+      mask += h_per_ovf * w;
+      h -= h_per_ovf;
+    } while (h > 0);
   }
   *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
   *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
@@ -347,11 +345,9 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-#if CONFIG_EXT_PARTITION
 HBD_OBMCVARWXH(128, 128)
 HBD_OBMCVARWXH(128, 64)
 HBD_OBMCVARWXH(64, 128)
-#endif  // CONFIG_EXT_PARTITION
 HBD_OBMCVARWXH(64, 64)
 HBD_OBMCVARWXH(64, 32)
 HBD_OBMCVARWXH(32, 64)
@@ -365,16 +361,9 @@ HBD_OBMCVARWXH(8, 8)
 HBD_OBMCVARWXH(8, 4)
 HBD_OBMCVARWXH(4, 8)
 HBD_OBMCVARWXH(4, 4)
-#if CONFIG_EXT_PARTITION_TYPES
 HBD_OBMCVARWXH(4, 16)
 HBD_OBMCVARWXH(16, 4)
 HBD_OBMCVARWXH(8, 32)
 HBD_OBMCVARWXH(32, 8)
 HBD_OBMCVARWXH(16, 64)
 HBD_OBMCVARWXH(64, 16)
-#if CONFIG_EXT_PARTITION
-HBD_OBMCVARWXH(32, 128)
-HBD_OBMCVARWXH(128, 32)
-#endif  // CONFIG_EXT_PARTITION
-#endif  // CONFIG_EXT_PARTITION_TYPES
-#endif  // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
index 954a95b98..e6b40262d 100644
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -44,16 +44,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                            m0, [zbinq]              ; m0 = zbin
 
   ; Get DC and first 15 AC coeffs - in this special case, that is all.
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers but we process them as 16 bit numbers
   mova                            m9, [coeffq]
   packssdw                        m9, [coeffq+16]          ; m9 = c[i]
   mova                           m10, [coeffq+32]
   packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-%else
-  mova                            m9, [coeffq]             ; m9 = c[i]
-  mova                           m10, [coeffq+16]          ; m10 = c[i]
-%endif
 
   mov                             r0, eobmp                ; Output pointer
   mov                             r1, qcoeffmp             ; Output pointer
@@ -76,15 +71,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   ptest                          m14, m14
   jnz .single_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova                       [r1   ], ymm5
   mova                       [r1+32], ymm5
   mova                       [r2   ], ymm5
   mova                       [r2+32], ymm5
-%else
-  mova                          [r1], ymm5
-  mova                          [r2], ymm5
-%endif
   mov                           [r0], word 0
 
   vzeroupper
@@ -124,7 +114,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pand                            m8, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -136,16 +125,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmovsxwd                       m11, m13
   mova                  [qcoeffq+32], m11
   mova                  [qcoeffq+48], m6
-%else
-  mova                  [qcoeffq   ], m8
-  mova                  [qcoeffq+16], m13
-%endif
 
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
 
-%if CONFIG_HIGHBITDEPTH
   ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -157,10 +141,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmovsxwd                       m11, m13
   mova                 [dqcoeffq+32], m11
   mova                 [dqcoeffq+48], m6
-%else
-  mova                 [dqcoeffq   ], m8
-  mova                 [dqcoeffq+16], m13
-%endif
 
   mova                            m6, [iscanq]            ; m6 = scan[i]
   mova                           m11, [iscanq+16]         ; m11 = scan[i]
@@ -229,29 +209,20 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
 
-%if CONFIG_HIGHBITDEPTH
+
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
+
   lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [coeffq+ncoeffq*4+16]
   mova                           m10, [coeffq+ncoeffq*4+32]
   packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
 
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
@@ -264,16 +235,10 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   ptest                          m14, m14
   jnz .first_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4   ], ymm5
   mova        [qcoeffq+ncoeffq*4+32], ymm5
   mova       [dqcoeffq+ncoeffq*4   ], ymm5
   mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova           [qcoeffq+ncoeffq*2], ymm5
-  mova          [dqcoeffq+ncoeffq*2], ymm5
-%endif
-
   add                        ncoeffq, mmsize
 
   punpckhqdq                      m1, m1
@@ -302,7 +267,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pand                            m8, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -314,10 +278,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
 
 %ifidn %1, b_32x32
   pabsw                           m8, m8
@@ -333,7 +293,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   psignw                         m13, m10
 %endif
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m8
   punpckhwd                       m6, m8, m6
@@ -345,10 +304,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
 
   pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
@@ -363,16 +318,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
 .ac_only_loop:
 
-%if CONFIG_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [coeffq+ncoeffq*4+16]
   mova                           m10, [coeffq+ncoeffq*4+32]
   packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
 
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
@@ -385,15 +335,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   ptest                          m14, m14
   jnz .rest_nonzero
 
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4+ 0], ymm5
   mova        [qcoeffq+ncoeffq*4+32], ymm5
   mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
   mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
-%endif
+
   add                        ncoeffq, mmsize
   jnz .ac_only_loop
 
@@ -424,7 +370,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pand                           m14, m7
   pand                           m13, m12
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
@@ -436,10 +381,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
 
 %ifidn %1, b_32x32
   pabsw                          m14, m14
@@ -454,7 +395,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   psignw                         m13, m10
 %endif
 
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pcmpgtw                         m6, m5, m14
   punpckhwd                       m6, m14, m6
@@ -466,10 +406,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
   pmovsxwd                       m11, m13
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
 
   pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
@@ -510,27 +446,16 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
 
 DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
 
-%if CONFIG_HIGHBITDEPTH
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-
   neg                        ncoeffq
   pxor                            m7, m7
 
 .blank_loop:
-%if CONFIG_HIGHBITDEPTH
   mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
   mova       [dqcoeffq+ncoeffq*4+32], ymm7
   mova        [qcoeffq+ncoeffq*4+ 0], ymm7
   mova        [qcoeffq+ncoeffq*4+32], ymm7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
-%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
 
@@ -543,5 +468,3 @@ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
 INIT_XMM avx
 QUANTIZE_FN b, 7
 QUANTIZE_FN b_32x32, 7
-
-END
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
index 0e7f679d0..46b9c7d29 100644
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -12,7 +12,8 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
index 36b4dddbd..e2c1ebb71 100644
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -45,7 +45,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
 %endif
   mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
+  psubw                           m0, [GLOBAL(pw_1)]
   mov                             r2, shiftmp
   mov                             r3, qcoeffmp
   mova                            m4, [r2]                 ; m4 = shift
@@ -56,29 +56,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
   pxor                            m5, m5                   ; m5 = dedicated zero
   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-%if CONFIG_HIGHBITDEPTH
   lea                         coeffq, [  coeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
   lea                         iscanq, [  iscanq+ncoeffq*2]
   neg                        ncoeffq
 
   ; get DC and first 15 AC coeffs
-%if CONFIG_HIGHBITDEPTH
   ; coeff stored as 32bit numbers & require 16bit numbers
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
   packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -99,7 +88,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                            m8, m7
   pand                           m13, m12
-%if CONFIG_HIGHBITDEPTH
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                           m11, m8
   mova                            m6, m8
@@ -117,10 +106,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
+
 %ifidn %1, b_32x32
   pabsw                           m8, m8
   pabsw                          m13, m13
@@ -134,7 +120,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                          m8, m9
   psignw                         m13, m10
 %endif
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                            m11, m8
   mova                            m6, m8
@@ -152,10 +137,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -169,16 +150,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   jz .accumulate_eob
 
 .ac_only_loop:
-%if CONFIG_HIGHBITDEPTH
   ; pack coeff from 32bit to 16bit array
   mova                            m9, [  coeffq+ncoeffq*4+ 0]
   packssdw                        m9, [  coeffq+ncoeffq*4+16]
   mova                           m10, [  coeffq+ncoeffq*4+32]
   packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
+
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
@@ -201,7 +178,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m13, m10                  ; m13 = reinsert sign
   pand                           m14, m7
   pand                           m13, m12
-%if CONFIG_HIGHBITDEPTH
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   pxor                           m11, m11
   mova                           m11, m14
@@ -220,10 +196,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+32], m11
   mova        [qcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
+
 %ifidn %1, b_32x32
   pabsw                          m14, m14
   pabsw                          m13, m13
@@ -236,7 +209,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   psignw                         m14, m9
   psignw                         m13, m10
 %endif
-%if CONFIG_HIGHBITDEPTH
+
   ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
   mova                           m11, m14
   mova                            m6, m14
@@ -254,10 +227,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+32], m11
   mova       [dqcoeffq+ncoeffq*4+48], m6
   pxor                            m5, m5
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
+
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -274,7 +244,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %ifidn %1, b_32x32
   jmp .accumulate_eob
 .skip_iter:
-%if CONFIG_HIGHBITDEPTH
   mova        [qcoeffq+ncoeffq*4+ 0], m5
   mova        [qcoeffq+ncoeffq*4+16], m5
   mova        [qcoeffq+ncoeffq*4+32], m5
@@ -283,12 +252,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova       [dqcoeffq+ncoeffq*4+16], m5
   mova       [dqcoeffq+ncoeffq*4+32], m5
   mova       [dqcoeffq+ncoeffq*4+48], m5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-%endif
   add                        ncoeffq, mmsize
   jl .ac_only_loop
 %endif
@@ -313,17 +276,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mov                             r2, qcoeffmp
   mov                             r3, eobmp
   DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-%if CONFIG_HIGHBITDEPTH
   lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
   neg                        ncoeffq
   pxor                            m7, m7
 .blank_loop:
-%if CONFIG_HIGHBITDEPTH
   mova       [dqcoeffq+ncoeffq*4+ 0], m7
   mova       [dqcoeffq+ncoeffq*4+16], m7
   mova       [dqcoeffq+ncoeffq*4+32], m7
@@ -332,12 +289,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova        [qcoeffq+ncoeffq*4+16], m7
   mova        [qcoeffq+ncoeffq*4+32], m7
   mova        [qcoeffq+ncoeffq*4+48], m7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-%endif
   add                        ncoeffq, mmsize
   jl .blank_loop
   mov                    word [eobq], 0
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
index e60f518b4..f662b62b1 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>  // AVX2
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom/aom_integer.h"
 
 void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
index 2c67f450f..55a856985 100644
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -233,11 +233,9 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 %endmacro
 
 INIT_XMM sse2
-%if CONFIG_EXT_PARTITION
 SADNXN4D 128, 128
 SADNXN4D 128, 64
 SADNXN4D 64,  128
-%endif
 SADNXN4D 64, 64
 SADNXN4D 64, 32
 SADNXN4D 32, 64
@@ -251,11 +249,9 @@ SADNXN4D  8,  8
 SADNXN4D  8,  4
 SADNXN4D  4,  8
 SADNXN4D  4,  4
-%if CONFIG_EXT_PARTITION_TYPES
 SADNXN4D  4, 16
 SADNXN4D 16,  4
 SADNXN4D  8, 32
 SADNXN4D 32,  8
 SADNXN4D 16, 64
 SADNXN4D 64, 16
-%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
index efba61289..a50dba64a 100644
--- a/third_party/aom/aom_dsp/x86/sad_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -9,7 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
 #define FSAD64_H(h)                                                           \
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
index e8dd87a26..b506d4663 100644
--- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
@@ -11,10 +11,11 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
 #include "aom_ports/mem.h"
 
 // SAD
@@ -360,7 +361,6 @@ unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
   return sum;
 }
 
-#if CONFIG_EXT_PARTITION
 static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
                      const uint16_t *sec_ptr, __m256i *sad_acc) {
   __m256i s[8], r[8];
@@ -471,7 +471,6 @@ unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
   sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
   return sum;
 }
-#endif  // CONFIG_EXT_PARTITION
 
 // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
 static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
@@ -649,7 +648,6 @@ unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
   return sum;
 }
 
-#if CONFIG_EXT_PARTITION
 unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            const uint8_t *second_pred) {
@@ -697,19 +695,13 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
                                        second_pred);
   return sum;
 }
-#endif  // CONFIG_EXT_PARTITION
 
 // SAD 4D
 // Combine 4 __m256i vectors to uint32_t result[4]
 static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
                                                uint32_t *res) {
   __m256i u0, u1, u2, u3;
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-  const __m256i mask = _mm256_setr_epi32(UINT32_MAX, 0, UINT32_MAX, 0,
-                                         UINT32_MAX, 0, UINT32_MAX, 0);
-#else
-  const __m256i mask = _mm256_set1_epi64x(UINT32_MAX);
-#endif
+  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
   __m128i sad;
 
   // 8 32-bit summation
@@ -967,7 +959,6 @@ void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
   sad_array[3] = first_half[3] + second_half[3];
 }
 
-#if CONFIG_EXT_PARTITION
 void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref_array[],
                                   int ref_stride, uint32_t *sad_array) {
@@ -1045,4 +1036,3 @@ void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
   sad_array[2] = first_half[2] + second_half[2];
   sad_array[3] = first_half[3] + second_half[3];
 }
-#endif  // CONFIG_EXT_PARTITION
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
index 4419c65b2..c6fd62c9e 100644
--- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -10,7 +10,8 @@
  */
 
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
+
+#include "config/aom_dsp_rtcd.h"
 
 static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride) {
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
index b4cc6abf1..3251b7655 100644
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -47,7 +47,6 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 %endif ; %3 == 7
 %endmacro
 
-%if CONFIG_EXT_PARTITION
 ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
 ;                                  uint8_t *ref, int ref_stride);
 %macro SAD128XN 1-2 0
@@ -114,7 +113,6 @@ SAD128XN 128     ; sad128x128_sse2
 SAD128XN 128, 1  ; sad128x128_avg_sse2
 SAD128XN 64      ; sad128x64_sse2
 SAD128XN 64, 1   ; sad128x64_avg_sse2
-%endif
 
 
 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
@@ -155,18 +153,14 @@ SAD128XN 64, 1   ; sad128x64_avg_sse2
 %endmacro
 
 INIT_XMM sse2
-%if CONFIG_EXT_PARTITION
 SAD64XN 128     ; sad64x128_sse2
 SAD64XN 128, 1  ; sad64x128_avg_sse2
-%endif
 SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
 SAD64XN 64, 1 ; sad64x64_avg_sse2
 SAD64XN 32, 1 ; sad64x32_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD64XN 16 ; sad64x16_sse2
 SAD64XN 16, 1 ; sad64x16_avg_sse2
-%endif
 
 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
@@ -212,10 +206,8 @@ SAD32XN 16 ; sad32x16_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD32XN 8 ; sad_32x8_sse2
 SAD32XN 8, 1 ; sad_32x8_avg_sse2
-%endif
 
 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -262,12 +254,10 @@ SAD16XN  8 ; sad16x8_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD16XN 4 ; sad_16x4_sse2
 SAD16XN 4, 1 ; sad_16x4_avg_sse2
 SAD16XN 64 ; sad_16x64_sse2
 SAD16XN 64, 1 ; sad_16x64_avg_sse2
-%endif
 
 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
@@ -312,10 +302,8 @@ SAD8XN  4 ; sad8x4_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
-%if CONFIG_EXT_PARTITION_TYPES
 SAD8XN 32 ; sad_8x32_sse2
 SAD8XN 32, 1 ; sad_8x32_avg_sse2
-%endif
 
 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
@@ -361,7 +349,5 @@ SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
-%if CONFIG_EXT_PARTITION_TYPES
 SAD4XN 16 ; sad_4x16_sse2
 SAD4XN 16, 1 ; sad_4x16_avg_sse2
-%endif
diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm
deleted file mode 100644
index f6c27c855..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,377 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBAOM_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBAOM_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-;void int aom_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x16x3_sse3) PRIVATE
-sym(aom_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x8x3_sse3) PRIVATE
-sym(aom_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad8x16x3_sse3) PRIVATE
-sym(aom_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad8x8x3_sse3) PRIVATE
-sym(aom_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int aom_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad4x4x3_sse3) PRIVATE
-sym(aom_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 5e9c75845..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,362 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
-    mov             rdi,        arg(4)           ;Results
-    pxor            xmm0, xmm0
-    movdqa          xmm2, xmm1
-    punpcklwd       xmm1, xmm0
-    punpckhwd       xmm2, xmm0
-
-    movdqa          [rdi],    xmm1
-    movdqa          [rdi + 16],    xmm2
-%endmacro
-
-;void aom_sad16x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(aom_sad16x16x8_sse4_1) PRIVATE
-sym(aom_sad16x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad16x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad16x8x8_sse4_1) PRIVATE
-sym(aom_sad16x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad8x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad8x8x8_sse4_1) PRIVATE
-sym(aom_sad8x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad8x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad8x16x8_sse4_1) PRIVATE
-sym(aom_sad8x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void aom_sad4x4x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(aom_sad4x4x8_sse4_1) PRIVATE
-sym(aom_sad4x4x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_4X2X8 1
-    PROCESS_4X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index 96b64b040..000000000
--- a/third_party/aom/aom_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,373 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void int aom_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x16x3_ssse3) PRIVATE
-sym(aom_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .aom_sad16x16x3_ssse3_skiptable
-.aom_sad16x16x3_ssse3_jumptable:
-        dd .aom_sad16x16x3_ssse3_aligned_by_0  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_1  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_2  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_3  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_4  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_5  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_6  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_7  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_8  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_9  - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump
-        dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump
-.aom_sad16x16x3_ssse3_skiptable:
-
-        call .aom_sad16x16x3_ssse3_do_jump
-.aom_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of aom_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3
-
-.aom_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.aom_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void int aom_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(aom_sad16x8x3_ssse3) PRIVATE
-sym(aom_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .aom_sad16x8x3_ssse3_skiptable
-.aom_sad16x8x3_ssse3_jumptable:
-        dd .aom_sad16x8x3_ssse3_aligned_by_0  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_1  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_2  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_3  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_4  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_5  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_6  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_7  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_8  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_9  - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump
-        dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump
-.aom_sad16x8x3_ssse3_skiptable:
-
-        call .aom_sad16x8x3_ssse3_do_jump
-.aom_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of aom_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3
-
-.aom_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.aom_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
index aa70106c8..6d9b5a12f 100644
--- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
+++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
@@ -47,6 +47,9 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
+
+SECTION .text
+
 ;void ssim_parms_sse2(
 ;    unsigned char *s,
 ;    int sp,
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
index d3feb7ec0..45bf6ec3c 100644
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -117,27 +117,26 @@ SECTION .text
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
 
-%ifdef PIC    ; 64bit PIC
+%if ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                      x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
+                                        x_offset, y_offset, dst, dst_stride, \
+                                        sec, sec_stride, height, sse
     %define sec_str sec_strideq
   %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                    x_offset, y_offset, dst, dst_stride, \
+                                    height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                          x_offset, y_offset, dst, dst_stride, \
+                                          sec, sec_stride, height, sse, \
+                                          g_bilin_filter, g_pw_8
       %define block_height dword heightm
       %define sec_str sec_stridemp
 
@@ -155,9 +154,9 @@ SECTION .text
 
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                                y_offset, dst, dst_stride, height, sse, \
-                                g_bilin_filter, g_pw_8
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, dst, dst_stride, \
+                                      height, sse, g_bilin_filter, g_pw_8
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
@@ -176,25 +175,18 @@ SECTION .text
     %endif
   %else
     %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                          x_offset, y_offset, \
+                                          dst, dst_stride, sec, sec_stride, \
+                                          height, sse
       %define block_height dword heightm
       %define sec_str sec_stridemp
-      %endif
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                              y_offset, dst, dst_stride, height, sse
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, dst, dst_stride, \
+                                      height, sse
       %define block_height heightd
     %endif
-
     %define bilin_filter bilin_filter_m
   %endif
 %endif
@@ -374,8 +366,8 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -383,7 +375,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -400,7 +392,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -697,8 +689,8 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -706,7 +698,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
@@ -723,7 +715,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -855,8 +847,8 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -864,7 +856,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -881,7 +873,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -997,8 +989,8 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
 %if ARCH_X86_64 && %1 > 4
@@ -1006,7 +998,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
@@ -1023,7 +1015,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -1195,8 +1187,8 @@ SECTION .text
   STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
@@ -1209,7 +1201,7 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                m11, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
@@ -1237,7 +1229,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
index 7bd5b23ad..1a75a234f 100644
--- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -34,10 +34,8 @@ cglobal subtract_block, 7, 7, 8, \
   je .case_16
   cmp                colsd, 32
   je .case_32
-%if CONFIG_EXT_PARTITION
   cmp                colsd, 64
   je .case_64
-%endif
 
 %macro loop16 6
   mova                  m0, [srcq+%1]
@@ -62,7 +60,6 @@ cglobal subtract_block, 7, 7, 8, \
   mova [diffq+mmsize*1+%6], m1
 %endmacro
 
-%if CONFIG_EXT_PARTITION
   mov             pred_str, pred_stridemp
 .loop_128:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
@@ -77,7 +74,6 @@ cglobal subtract_block, 7, 7, 8, \
   RET
 
 .case_64:
-%endif
   mov             pred_str, pred_stridemp
 .loop_64:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
index 6be99fbca..a79f22d79 100644
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -14,35 +14,62 @@
 #include <stdio.h>
 
 #include "aom_dsp/x86/synonyms.h"
+#include "config/aom_dsp_rtcd.h"
 
-#include "./aom_dsp_rtcd.h"
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d ad = _mm_castsi128_pd(a);
+  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, a);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
 
 static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                 int stride) {
-  const __m128i v_val_0_w =
-      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
-  const __m128i v_val_1_w =
-      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
-  const __m128i v_val_2_w =
-      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
-  const __m128i v_val_3_w =
-      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
-
-  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
-  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
-  const __m128i v_sum_d =
+  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+  __m128i v_sum_d =
       _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-
+  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
   return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
 }
 
+static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                                int height) {
+  int r = 0;
+  __m128i v_acc_q = _mm_setzero_si128();
+  do {
+    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
+  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+  return xx_cvtsi128_si64(v_acc_64);
+}
+
 #ifdef __GNUC__
 // This prevents GCC/Clang from inlining this function into
 // aom_sum_squares_2d_i16_sse2, which in turn saves some stack
@@ -52,72 +79,45 @@ __attribute__((noinline))
 static uint64_t
 aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
                                 int height) {
-  int r, c;
+  int r = 0;
 
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
   __m128i v_acc_q = _mm_setzero_si128();
 
-  for (r = 0; r < height; r += 8) {
+  do {
     __m128i v_acc_d = _mm_setzero_si128();
-
-    for (c = 0; c < width; c += 8) {
+    int c = 0;
+    do {
       const int16_t *b = src + c;
 
-      const __m128i v_val_0_w =
-          _mm_load_si128((const __m128i *)(b + 0 * stride));
-      const __m128i v_val_1_w =
-          _mm_load_si128((const __m128i *)(b + 1 * stride));
-      const __m128i v_val_2_w =
-          _mm_load_si128((const __m128i *)(b + 2 * stride));
-      const __m128i v_val_3_w =
-          _mm_load_si128((const __m128i *)(b + 3 * stride));
-      const __m128i v_val_4_w =
-          _mm_load_si128((const __m128i *)(b + 4 * stride));
-      const __m128i v_val_5_w =
-          _mm_load_si128((const __m128i *)(b + 5 * stride));
-      const __m128i v_val_6_w =
-          _mm_load_si128((const __m128i *)(b + 6 * stride));
-      const __m128i v_val_7_w =
-          _mm_load_si128((const __m128i *)(b + 7 * stride));
+      const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+      const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+      const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+      const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
 
       const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
       const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
       const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
       const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
-      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
-      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
-      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
 
       const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
       const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
-      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
 
       const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
 
       v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
-    }
+      c += 8;
+    } while (c < width);
 
     v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
     v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
 
-    src += 8 * stride;
-  }
+    src += 4 * stride;
+    r += 4;
+  } while (r < height);
 
   v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
-    return tmp;
-  }
-#endif
+  return xx_cvtsi128_si64(v_acc_q);
 }
 
 uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
@@ -127,7 +127,9 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
   // are with size == 4, so it is also the common case.
   if (LIKELY(width == 4 && height == 4)) {
     return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) {
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
     // Generic case
     return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
   } else {
@@ -140,7 +142,7 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
 //////////////////////////////////////////////////////////////////////////////
 
 static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
   __m128i v_acc0_q = _mm_setzero_si128();
   __m128i v_acc1_q = _mm_setzero_si128();
 
@@ -185,16 +187,7 @@ static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
 
   v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
   v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
-    return tmp;
-  }
-#endif
+  return xx_cvtsi128_si64(v_acc0_q);
 }
 
 uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index cd049a454..d9a53fcc5 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -14,7 +14,8 @@
 
 #include <immintrin.h>
 
-#include "./aom_config.h"
+#include "config/aom_config.h"
+
 #include "aom/aom_integer.h"
 
 /**
@@ -58,6 +59,28 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) {
   _mm_storeu_si128((__m128i *)a, v);
 }
 
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, e1, 0, e0);
+#else
+  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, a, 0, a);
+#else
+  return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
 static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
   return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
 }
@@ -89,4 +112,12 @@ static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
   return _mm_srai_epi32(v_tmp_d, bits);
 }
 
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+  const __m128i v_tmp_d =
+      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
 #endif  // AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 000000000..39f371fc9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+  return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+  _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+  _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+  return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+#endif  // AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 000000000..f88a1527d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+                                      __m128i *const out) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03 04 05 06 07
+  // in[1]: 10 11 12 13 14 15 16 17
+  // in[2]: 20 21 22 23 24 25 26 27
+  // in[3]: 30 31 32 33 34 35 36 37
+  // in[4]: 40 41 42 43 44 45 46 47
+  // in[5]: 50 51 52 53 54 55 56 57
+  // in[6]: 60 61 62 63 64 65 66 67
+  // in[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+  // Unpack 16 bit elements resulting in:
+  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+  // Unpack 32 bit elements resulting in:
+  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30 40 50 60 70
+  // out[1]: 01 11 21 31 41 51 61 71
+  // out[2]: 02 12 22 32 42 52 62 72
+  // out[3]: 03 13 23 33 43 53 63 73
+  // out[4]: 04 14 24 34 44 54 64 74
+  // out[5]: 05 15 25 35 45 55 65 75
+  // out[6]: 06 16 26 36 46 56 66 76
+  // out[7]: 07 17 27 37 47 57 67 77
+  out[0] = _mm_unpacklo_epi64(c0, c0);
+  out[1] = _mm_unpackhi_epi64(c0, c0);
+  out[2] = _mm_unpacklo_epi64(c1, c1);
+  out[3] = _mm_unpackhi_epi64(c1, c1);
+  out[4] = _mm_unpacklo_epi64(c2, c2);
+  out[5] = _mm_unpackhi_epi64(c2, c2);
+  out[6] = _mm_unpacklo_epi64(c3, c3);
+  out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi32(a0, a1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(a0, a1);
+  out[3] = _mm_srli_si128(out[2], 8);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // in[4]: 40 41 42 43  XX XX XX XX
+  // in[5]: 50 51 52 53  XX XX XX XX
+  // in[6]: 60 61 62 63  XX XX XX XX
+  // in[7]: 70 71 72 73  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 02 12 22 32  03 13 23 33
+  // b3: 42 52 62 72  43 53 63 73
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b2, b3);
+  out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b2: 04 14 24 34  05 15 25 35
+  // b4: 02 12 22 32  03 13 23 33
+  // b6: 06 16 26 36  07 17 27 37
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  XX XX XX XX
+  // out[1]: 01 11 21 31  XX XX XX XX
+  // out[2]: 02 12 22 32  XX XX XX XX
+  // out[3]: 03 13 23 33  XX XX XX XX
+  // out[4]: 04 14 24 34  XX XX XX XX
+  // out[5]: 05 15 25 35  XX XX XX XX
+  // out[6]: 06 16 26 36  XX XX XX XX
+  // out[7]: 07 17 27 37  XX XX XX XX
+  const __m128i zeros = _mm_setzero_si128();
+  out[0] = _mm_unpacklo_epi64(b0, zeros);
+  out[1] = _mm_unpackhi_epi64(b0, zeros);
+  out[2] = _mm_unpacklo_epi64(b4, zeros);
+  out[3] = _mm_unpackhi_epi64(b4, zeros);
+  out[4] = _mm_unpacklo_epi64(b2, zeros);
+  out[5] = _mm_unpackhi_epi64(b2, zeros);
+  out[6] = _mm_unpacklo_epi64(b6, zeros);
+  out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+  // in[4]: 40 41 42 43  44 45 46 47
+  // in[5]: 50 51 52 53  54 55 56 57
+  // in[6]: 60 61 62 63  64 65 66 67
+  // in[7]: 70 71 72 73  74 75 76 77
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  // a6:    44 54 45 55  46 56 47 57
+  // a7:    64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 04 14 24 34  05 15 25 35
+  // b3: 44 54 64 74  45 55 65 75
+  // b4: 02 12 22 32  03 13 23 33
+  // b5: 42 52 62 72  43 53 63 73
+  // b6: 06 16 26 36  07 17 27 37
+  // b7: 46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  // out[4]: 04 14 24 34  44 54 64 74
+  // out[5]: 05 15 25 35  45 55 65 75
+  // out[6]: 06 16 26 36  46 56 66 76
+  // out[7]: 07 17 27 37  47 57 67 77
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b4, b5);
+  out[3] = _mm_unpackhi_epi64(b4, b5);
+  out[4] = _mm_unpacklo_epi64(b2, b3);
+  out[5] = _mm_unpackhi_epi64(b2, b3);
+  out[6] = _mm_unpacklo_epi64(b6, b7);
+  out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i tbuf[8];
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, tbuf);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  left[8] = tbuf[0];
+  left[9] = tbuf[1];
+  left[10] = tbuf[2];
+  left[11] = tbuf[3];
+  left[12] = tbuf[4];
+  left[13] = tbuf[5];
+  left[14] = tbuf[6];
+  left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+                                         __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // in[4]: 04 05 06 07
+  // in[5]: 14 15 16 17
+  // in[6]: 24 25 26 27
+  // in[7]: 34 35 36 37
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+  // a4:    04 14 05 15
+  // a5:    24 34 25 35
+  // a6:    06 16 07 17
+  // a7:    26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 04 05 06 07
+  // in[2]: 10 11 12 13
+  // in[3]: 14 15 16 17
+  // in[4]: 20 21 22 23
+  // in[5]: 24 25 26 27
+  // in[6]: 30 31 32 33
+  // in[7]: 34 35 36 37
+  // to:
+  // a0: 00 10 01 11
+  // a1: 20 30 21 31
+  // a2: 02 12 03 13
+  // a3: 22 32 23 33
+  // a4: 04 14 05 15
+  // a5: 24 34 25 35
+  // a6: 06 16 07 17
+  // a7: 26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif  // AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
deleted file mode 100644
index 1a8fed710..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
-#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
-
-#include <immintrin.h>
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/common_avx2.h"
-
-#define pair256_set_epi16(a, b)                                            \
-  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define pair256_set_epi32(a, b)                                                \
-  _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
-                   (int)(b), (int)(a))
-
-static INLINE void mm256_reverse_epi16(__m256i *u) {
-  const __m256i control = _mm256_set_epi16(
-      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
-      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
-  __m256i v = _mm256_shuffle_epi8(*u, control);
-  *u = _mm256_permute2x128_si256(v, v, 1);
-}
-
-static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
-                                 const __m256i *cospi) {
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i y0 = _mm256_madd_epi16(*a0, *cospi);
-  __m256i y1 = _mm256_madd_epi16(*a1, *cospi);
-
-  y0 = _mm256_add_epi32(y0, dct_rounding);
-  y1 = _mm256_add_epi32(y1, dct_rounding);
-  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
-
-  return _mm256_packs_epi32(y0, y1);
-}
-
-static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
-  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i u0, u1;
-  int i = 0;
-
-  while (i < 16) {
-    in[i] = _mm256_slli_epi16(in[i], 1);
-
-    u0 = _mm256_unpacklo_epi16(zero, in[i]);
-    u1 = _mm256_unpackhi_epi16(zero, in[i]);
-
-    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
-    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
-
-    u0 = _mm256_add_epi32(u0, dct_const_rounding);
-    u1 = _mm256_add_epi32(u1, dct_const_rounding);
-
-    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
-    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
-    in[i] = _mm256_packs_epi32(u0, u1);
-    i++;
-  }
-}
-
-#endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
deleted file mode 100644
index 4e6eecd32..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
-#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m128i zero = _mm_setzero_si128();
-    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-    _mm_storeu_si128((__m128i *)(dst_ptr), out0);
-    _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-  } else {
-    _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-  }
-}
-
-#endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
index 4257d8b9c..58a792424 100644
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -16,17 +16,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/synonyms.h"
 
-#define pair_set_epi16(a, b)                                            \
-  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
-                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define dual_set_epi16(a, b)                                            \
-  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
-                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
-
-#define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
-  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
-                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+#define pair_set_epi16(a, b) \
+  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
 
 // Reverse the 8 16 bit words in __m128i
 static INLINE __m128i mm_reverse_epi16(const __m128i x) {
@@ -35,292 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) {
   return _mm_shuffle_epi32(b, 0x4e);
 }
 
-#if CONFIG_EXT_TX
-// Identity transform (both forward and inverse).
-static INLINE void idtx16_8col(__m128i *in) {
-  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
-  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
-
-  in[0] = _mm_slli_epi16(in[0], 1);
-  in[1] = _mm_slli_epi16(in[1], 1);
-  in[2] = _mm_slli_epi16(in[2], 1);
-  in[3] = _mm_slli_epi16(in[3], 1);
-  in[4] = _mm_slli_epi16(in[4], 1);
-  in[5] = _mm_slli_epi16(in[5], 1);
-  in[6] = _mm_slli_epi16(in[6], 1);
-  in[7] = _mm_slli_epi16(in[7], 1);
-  in[8] = _mm_slli_epi16(in[8], 1);
-  in[9] = _mm_slli_epi16(in[9], 1);
-  in[10] = _mm_slli_epi16(in[10], 1);
-  in[11] = _mm_slli_epi16(in[11], 1);
-  in[12] = _mm_slli_epi16(in[12], 1);
-  in[13] = _mm_slli_epi16(in[13], 1);
-  in[14] = _mm_slli_epi16(in[14], 1);
-  in[15] = _mm_slli_epi16(in[15], 1);
-
-  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
-  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
-  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
-  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
-  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
-  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
-  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
-  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
-
-  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
-  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
-  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
-  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
-  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
-  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
-  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
-  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
-
-  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
-  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
-  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
-  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
-  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
-  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
-  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
-  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
-
-  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
-  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
-  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
-  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
-  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
-  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
-  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
-  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
-
-  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
-  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
-  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
-  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
-  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
-  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
-  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
-  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
-
-  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
-  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
-  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
-  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
-  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
-  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
-  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
-  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
-
-  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
-  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
-  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
-  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
-  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
-  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
-  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
-  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
-
-  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
-  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
-  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
-  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
-  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
-  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
-  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
-  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
-
-  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
-  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
-  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
-  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
-  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
-  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
-  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
-
-  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(v0, x0);
-  in[1] = _mm_packs_epi32(v1, x1);
-  in[2] = _mm_packs_epi32(v2, x2);
-  in[3] = _mm_packs_epi32(v3, x3);
-  in[4] = _mm_packs_epi32(v4, x4);
-  in[5] = _mm_packs_epi32(v5, x5);
-  in[6] = _mm_packs_epi32(v6, x6);
-  in[7] = _mm_packs_epi32(v7, x7);
-
-  in[8] = _mm_packs_epi32(u0, y0);
-  in[9] = _mm_packs_epi32(u1, y1);
-  in[10] = _mm_packs_epi32(u2, y2);
-  in[11] = _mm_packs_epi32(u3, y3);
-  in[12] = _mm_packs_epi32(u4, y4);
-  in[13] = _mm_packs_epi32(u5, y5);
-  in[14] = _mm_packs_epi32(u6, y6);
-  in[15] = _mm_packs_epi32(u7, y7);
-}
-#endif  // CONFIG_EXT_TX
-
-static INLINE void scale_sqrt2_8x4(__m128i *in) {
-  // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
-  // consecutive elements.
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
-  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
-  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
-  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
-  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
-}
-
-static INLINE void scale_sqrt2_8x8(__m128i *in) {
-  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
-  // for each element.
-  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
-  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
-  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
-  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
-  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
-  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
-  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
-  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
-  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
-  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
-  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
-  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
-  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
-  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
-  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
-  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
-
-  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
-  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
-  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
-  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
-  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
-  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
-  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
-  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
-  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
-  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
-  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
-
-  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
-  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
-  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
-  in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
-  in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
-  in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
-  in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
-                          xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
-}
-
-static INLINE void scale_sqrt2_8x16(__m128i *in) {
-  scale_sqrt2_8x8(in);
-  scale_sqrt2_8x8(in + 8);
-}
-
 #endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 18a70dffe..7d6b7d287 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -10,109 +10,224 @@
  */
 
 #include <immintrin.h>
-#include "./aom_dsp_rtcd.h"
-
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse, int *sum);
-
-void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum);
-
-static void variance_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, int w, int h,
-                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
-                          int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+  return _mm_add_epi16(_mm256_castsi256_si128(val),
+                       _mm256_extractf128_si256(val, 1));
 }
 
-unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                aom_get16x16var_avx2, 16);
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+  return _mm_add_epi32(_mm256_castsi256_si128(val),
+                       _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+                                        __m256i *const sse,
+                                        __m256i *const sum) {
+  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
+
+  // unpack into pairs of source and reference values
+  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+  // subtract adjacent elements using src*1 + ref*-1
+  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
 
-  variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-  _mm256_zeroupper();
-  return variance;
+  // add to the running totals
+  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
 }
 
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
-  _mm256_zeroupper();
-  return *sse;
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+                                                     unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+  // unpack sse and sum registers and add
+  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+  // perform the final summation and extract the results
+  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+  *((int *)sse) = _mm_cvtsi128_si32(res);
+  return _mm_extract_epi32(res, 1);
+}
+
+// handle pixels (<= 512)
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+                                          unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
+  const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
+  return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
+}
+
+// handle 1024 pixels (32x32, 16x64, 64x16)
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  // extract the low lane and add it to the high lane
+  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+  const __m128i vsum_64 =
+      _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+                    _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+  return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+  const __m256i sum_hi =
+      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+  return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+// handle 2048 pixels (32x64, 64x32)
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+                                           unsigned int *const sse) {
+  vsum = sum_to_32bit_avx2(vsum);
+  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
+  return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
+}
+
+static INLINE void variance16_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+  variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m256i *const sse,
+                                          __m256i *const sum) {
+  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+  variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
+
+  for (int i = 0; i < h; i += 2) {
+    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
 }
 
-unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {
+    variance32_kernel_avx2(src, ref, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get32x32var_avx2, 32);
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m256i *const vsse,
+                                    __m256i *const vsum) {
+  *vsum = _mm256_setzero_si256();
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
-  _mm256_zeroupper();
-  return variance;
+  for (int i = 0; i < h; i++) {
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
+    variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  unsigned int variance;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get32x32var_avx2, 32);
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m256i vsse = _mm256_setzero_si256();                                    \
+    __m256i vsum;                                                             \
+    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
+    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
+
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
 
-  variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
-  _mm256_zeroupper();
-  return variance;
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
+
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
+  unsigned int aom_variance##bw##x##bh##_avx2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m256i vsse = _mm256_setzero_si256();                                    \
+    __m256i vsum = _mm256_setzero_si256();                                    \
+    for (int i = 0; i < (bh / uh); i++) {                                     \
+      __m256i vsum16;                                                         \
+      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
+                          &vsum16);                                           \
+      vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
+      src += uh * src_stride;                                                 \
+      ref += uh * ref_stride;                                                 \
+    }                                                                         \
+    const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
+    const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
+    return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
+  }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
 }
 
 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -125,68 +240,164 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
     int height, unsigned int *sseptr);
 
-unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  unsigned int sse1;
-  const int se1 = aom_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
-  unsigned int sse2;
-  const int se2 =
-      aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
-                                      dst + 32, dst_stride, 64, &sse2);
-  const int se = se1 + se2;
-  unsigned int variance;
-  *sse = sse1 + sse2;
-
-  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  const int se = aom_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
-
-  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
-  _mm256_zeroupper();
-  return variance;
-}
-
-unsigned int aom_sub_pixel_avg_variance64x64_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
-  unsigned int sse1;
-  const int se1 = aom_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
-  unsigned int sse2;
-  const int se2 = aom_sub_pixel_avg_variance32xh_avx2(
-      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
-      64, 64, &sse2);
-  const int se = se1 + se2;
-  unsigned int variance;
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2)                        \
+  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                        \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Avoid overflow in helper by capping height.*/                           \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) {                                      \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) {                                    \
+        unsigned int sse2;                                                    \
+        const int se2 = aom_sub_pixel_variance##wf##xh_avx2(                  \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2);                                                           \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse;                                                           \
+    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
+  }
+
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
+      const uint8_t *sec) {                                               \
+    /*Avoid overflow in helper by capping height.*/                       \
+    const int hf = AOMMIN(h, 64);                                         \
+    unsigned int sse = 0;                                                 \
+    int se = 0;                                                           \
+    for (int i = 0; i < (w / wf); ++i) {                                  \
+      const uint8_t *src_ptr = src;                                       \
+      const uint8_t *dst_ptr = dst;                                       \
+      const uint8_t *sec_ptr = sec;                                       \
+      for (int j = 0; j < (h / hf); ++j) {                                \
+        unsigned int sse2;                                                \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2(          \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+            sec_ptr, w, hf, &sse2);                                       \
+        dst_ptr += hf * dst_stride;                                       \
+        src_ptr += hf * src_stride;                                       \
+        sec_ptr += hf * w;                                                \
+        se += se2;                                                        \
+        sse += sse2;                                                      \
+      }                                                                   \
+      src += wf;                                                          \
+      dst += wf;                                                          \
+      sec += wf;                                                          \
+    }                                                                     \
+    *sse_ptr = sse;                                                       \
+    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
+  }
 
-  *sse = sse1 + sse2;
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
 
-  variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
-  _mm256_zeroupper();
-  return variance;
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+  const __m256i d =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
 }
 
-unsigned int aom_sub_pixel_avg_variance32x32_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
-  // Process 32 elements in parallel.
-  const int se = aom_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
-
-  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
-  _mm256_zeroupper();
-  return variance;
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+                                            const __m256i a,
+                                            uint8_t *comp_pred) {
+  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
+
+  const __m256i ma = _mm256_sub_epi8(alpha_max, a);
+
+  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
+  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
+  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
+  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
+
+  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
+  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
+  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
+  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
+
+  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
+  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
+}
+
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                             int height, const uint8_t *ref, int ref_stride,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask) {
+  int i = 0;
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {
+    do {
+      const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
+      const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
+      const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      // comp_pred's stride == width == 16
+      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+      comp_pred += (16 << 2);
+      i += 4;
+    } while (i < height);
+  } else {  // for width == 32
+    do {
+      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
+      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
+      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
+
+      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
+      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
+      const __m256i aB =
+          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
+
+      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+      comp_pred += (32 << 1);
+
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  }
 }
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
index 999b541e3..88e27aef3 100644
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -11,7 +11,8 @@
 
 #include <immintrin.h>  // AVX2
 
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
 #include "aom_ports/mem.h"
 
 /* clang-format off */
@@ -35,203 +36,6 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
 };
 /* clang-format on */
 
-void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i, src_2strides, ref_2strides;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing two strides in a 256 bit register reducing the number
-  // of loop stride by half (comparing to the sse2 code)
-  src_2strides = source_stride << 1;
-  ref_2strides = recon_stride << 1;
-  for (i = 0; i < 8; i++) {
-    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
-    src = _mm256_inserti128_si256(
-        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
-
-    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
-    ref = _mm256_inserti128_si256(
-        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += src_2strides;
-    ref_ptr += ref_2strides;
-  }
-
-  {
-    __m128i sum_res, madd_res;
-    __m128i expand_sum_low, expand_sum_high, expand_sum;
-    __m128i expand_madd_low, expand_madd_high, expand_madd;
-    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // extract the low lane and add it to the high lane
-    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
-                            _mm256_extractf128_si256(sum_ref_src, 1));
-
-    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
-                             _mm256_extractf128_si256(madd_ref_src, 1));
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low =
-        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-    expand_sum_high =
-        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low =
-        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-    expand_madd_high =
-        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-
-    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low =
-        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-    ex_expand_sum_high =
-        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-
-    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_res = _mm_srli_si128(expand_madd, 8);
-    sum_res = _mm_srli_si128(ex_expand_sum, 8);
-
-    madd_res = _mm_add_epi32(madd_res, expand_madd);
-    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
-
-    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
-
-    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing 32 elements in parallel
-  for (i = 0; i < 16; i++) {
-    src = _mm256_loadu_si256((__m256i const *)(src_ptr));
-
-    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-
-  {
-    __m256i expand_sum_low, expand_sum_high, expand_sum;
-    __m256i expand_madd_low, expand_madd_high, expand_madd;
-    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
-    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
-    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
-
-    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
-    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
-
-    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
-    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
-
-    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
-    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
-
-    // extract the low lane and the high lane and add the results
-    *((int *)SSE) =
-        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
-
-    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
-                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
-  }
-  _mm256_zeroupper();
-}
-
 #define FILTER_SRC(filter)                               \
   /* filter the source */                                \
   exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index 211fad3f8..c8c90a7dc 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -12,24 +12,24 @@
 #include <assert.h>
 #include <emmintrin.h>  // SSE2
 
-#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
 
 #include "aom_ports/mem.h"
 
-#include "./av1_rtcd.h"
 #include "av1/common/filter.h"
-
-typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
 
 unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   __m128i vsum = _mm_setzero_si128();
   int i;
 
   for (i = 0; i < 32; ++i) {
-    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v = xx_loadu_128(src);
     vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
     src += 8;
   }
@@ -39,276 +39,265 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   return _mm_cvtsi128_si32(vsum);
 }
 
-#define READ64(p, stride, i)                                  \
-  _mm_unpacklo_epi8(                                          \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
 
-static void get4x4var_sse2(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
-  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
-  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
-  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-  // sum
-  __m128i vsum = _mm_add_epi16(diff0, diff1);
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
 
-  // sse
-  vsum =
-      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  *sse = _mm_cvtsi128_si32(vsum);
+// Accumulate 4 32bit numbers in val to 1 32bit number
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+  return _mm_cvtsi128_si32(val);
 }
 
-void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
-                        int ref_stride, unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Accumulate 8 16bit in sum to 4 32bit number
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+  return _mm_add_epi32(sum_lo, sum_hi);
+}
 
-  for (i = 0; i < 8; i += 2) {
-    const __m128i src0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
-    const __m128i src1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
-    const __m128i ref1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-  }
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i diff = _mm_sub_epi16(src, ref);
+  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+  *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// diff sum of 128 pixels can still fit in 16bit integer
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  // sum
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
 }
 
-void aom_get16x16var_sse2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  for (i = 0; i < 16; ++i) {
-    const __m128i s = _mm_loadu_si128((const __m128i *)src);
-    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
 
-    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_unpacklo_epi16(vsum, vsum);
+  vsum = _mm_srai_epi32(vsum, 16);
+  *sum = add32x4_sse2(vsum);
+}
 
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+                                                unsigned int *const sse,
+                                                int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-    src += src_stride;
-    ref += ref_stride;
-  }
+  vsum = sum_to_32bit_sse2(vsum);
+  *sum = add32x4_sse2(vsum);
+}
 
-  // sum
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum =
-      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 256);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
 
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
-}
+  for (int i = 0; i < h; i += 2) {
+    const __m128i s = load4x2_sse2(src, src_stride);
+    const __m128i r = load4x2_sse2(ref, ref_stride);
 
-static void variance_sse2(const unsigned char *src, int src_stride,
-                          const unsigned char *ref, int ref_stride, int w,
-                          int h, unsigned int *sse, int *sum,
-                          getNxMvar_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
+    variance_kernel_sse2(s, r, sse, sum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
   }
 }
 
-unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 4 * 4);
-  assert(sum >= -255 * 4 * 4);
-  return *sse - ((sum * sum) >> 4);
-}
-
-unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 8 * 4);
-  assert(sum >= -255 * 8 * 4);
-  return *sse - ((sum * sum) >> 5);
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  assert(h <= 128);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+  for (int i = 0; i < h; i++) {
+    const __m128i s = load8_8to16_sse2(src);
+    const __m128i r = load8_8to16_sse2(ref);
+
+    variance_kernel_sse2(s, r, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 8 * 4);
-  assert(sum >= -255 * 8 * 4);
-  return *sse - ((sum * sum) >> 5);
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m128i *const sse,
+                                          __m128i *const sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i s = _mm_loadu_si128((const __m128i *)src);
+  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+  variance_kernel_sse2(src0, ref0, sse, sum);
+  variance_kernel_sse2(src1, ref1, sse, sum);
 }
 
-unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 8 * 8);
-  assert(sum >= -255 * 8 * 8);
-  return *sse - ((sum * sum) >> 6);
-}
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 64);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
 
-unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 16 * 8);
-  assert(sum >= -255 * 16 * 8);
-  return *sse - ((sum * sum) >> 7);
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src, ref, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 16 * 8);
-  assert(sum >= -255 * 16 * 8);
-  return *sse - ((sum * sum) >> 7);
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 32);  // May overflow for larger height.
+  // Don't initialize sse here since it's an accumulation.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride,
-                                    const unsigned char *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  assert(sum <= 255 * 16 * 16);
-  assert(sum >= -255 * 16 * 16);
-  return *sse - ((uint32_t)((int64_t)sum * sum) >> 8);
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  assert(h <= 16);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
+    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 32);
-  assert(sum >= -255 * 32 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+                                    const uint8_t *ref, const int ref_stride,
+                                    const int h, __m128i *const sse,
+                                    __m128i *const sum) {
+  assert(h <= 8);  // May overflow for larger height.
+  *sum = _mm_setzero_si128();
+
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      const int offset0 = j << 5;
+      const int offset1 = offset0 + 16;
+      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
+      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m128i vsse = _mm_setzero_si128();                                       \
+    __m128i vsum;                                                             \
+    int sum = 0;                                                              \
+    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
+    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum);            \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 32 * 16);
-  assert(sum >= -255 * 32 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
-}
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
+
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
+
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
+  unsigned int aom_variance##bw##x##bh##_sse2(                                \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    __m128i vsse = _mm_setzero_si128();                                       \
+    __m128i vsum = _mm_setzero_si128();                                       \
+    for (int i = 0; i < (bh / uh); ++i) {                                     \
+      __m128i vsum16;                                                         \
+      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse,        \
+                          &vsum16);                                           \
+      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));                  \
+      src += (src_stride * uh);                                               \
+      ref += (ref_stride * uh);                                               \
+    }                                                                         \
+    *sse = add32x4_sse2(vsse);                                                \
+    int sum = add32x4_sse2(vsum);                                             \
+    assert(sum <= 255 * bw * bh);                                             \
+    assert(sum >= -255 * bw * bh);                                            \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
+  }
 
-unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 64);
-  assert(sum >= -255 * 64 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
-}
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
 
-unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
 
-unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 32);
-  assert(sum >= -255 * 64 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
 
 unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
@@ -338,74 +327,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
   return *sse;
 }
 
-#if CONFIG_EXT_PARTITION_TYPES
-unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 4 * 16);
-  assert(sum >= -255 * 4 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
-}
-
-unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum,
-                get4x4var_sse2, 4);
-  assert(sum <= 255 * 16 * 4);
-  assert(sum >= -255 * 16 * 4);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
-}
-
-unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 8 * 32);
-  assert(sum >= -255 * 8 * 32);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
-}
-
-unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride,
-                                   const uint8_t *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum,
-                aom_get8x8var_sse2, 8);
-  assert(sum <= 255 * 32 * 8);
-  assert(sum >= -255 * 32 * 8);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 8);
-}
-
-unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 16 * 64);
-  assert(sum >= -255 * 16 * 64);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum,
-                aom_get16x16var_sse2, 16);
-  assert(sum <= 255 * 64 * 16);
-  assert(sum >= -255 * 64 * 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-#endif
-
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt)                                                           \
@@ -423,75 +344,57 @@ DECLS(ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
-    unsigned int sse;                                                          \
-    int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
-                                                  y_offset, dst, dst_stride,   \
-                                                  h, &sse, NULL, NULL);        \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_sub_pixel_variance##wf##xh_##opt(                          \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
+  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
+    /*Avoid overflow in helper by capping height.*/                           \
+    const int hf = AOMMIN(h, 64);                                             \
+    unsigned int sse = 0;                                                     \
+    int se = 0;                                                               \
+    for (int i = 0; i < (w / wf); ++i) {                                      \
+      const uint8_t *src_ptr = src;                                           \
+      const uint8_t *dst_ptr = dst;                                           \
+      for (int j = 0; j < (h / hf); ++j) {                                    \
+        unsigned int sse2;                                                    \
+        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+            &sse2, NULL, NULL);                                               \
+        dst_ptr += hf * dst_stride;                                           \
+        src_ptr += hf * src_stride;                                           \
+        se += se2;                                                            \
+        sse += sse2;                                                          \
+      }                                                                       \
+      src += wf;                                                              \
+      dst += wf;                                                              \
+    }                                                                         \
+    *sse_ptr = sse;                                                           \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));   \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t));    \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t));  \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
-#endif
+#define FNS(opt)                                     \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
+  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
+  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
+  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
+  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -516,76 +419,61 @@ DECLS(ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                    \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
-      const uint8_t *sec) {                                                    \
-    unsigned int sse;                                                          \
-    int se = aom_sub_pixel_avg_variance##wf##xh_##opt(                         \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                      \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sseptr = sse;                                                             \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
+  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
+      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
+      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
+      const uint8_t *sec) {                                                  \
+    /*Avoid overflow in helper by capping height.*/                          \
+    const int hf = AOMMIN(h, 64);                                            \
+    unsigned int sse = 0;                                                    \
+    int se = 0;                                                              \
+    for (int i = 0; i < (w / wf); ++i) {                                     \
+      const uint8_t *src_ptr = src;                                          \
+      const uint8_t *dst_ptr = dst;                                          \
+      const uint8_t *sec_ptr = sec;                                          \
+      for (int j = 0; j < (h / hf); ++j) {                                   \
+        unsigned int sse2;                                                   \
+        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
+            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
+            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
+        dst_ptr += hf * dst_stride;                                          \
+        src_ptr += hf * src_stride;                                          \
+        sec_ptr += hf * w;                                                   \
+        se += se2;                                                           \
+        sse += sse2;                                                         \
+      }                                                                      \
+      src += wf;                                                             \
+      dst += wf;                                                             \
+      sec += wf;                                                             \
+    }                                                                        \
+    *sse_ptr = sse;                                                          \
+    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
   }
 
-#if CONFIG_EXT_PARTITION_TYPES
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));   \
-  FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t));    \
-  FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t));  \
-  FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t))
-#else
-#define FNS(opt)                                    \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));    \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
-#endif
+#define FNS(opt)                                     \
+  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
+  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
+  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
+  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
+  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
+  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
+  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
+  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
+  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
+  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
+  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
+  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
+  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
+  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
+  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
+  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
+  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
+  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
 
 FNS(sse2);
 FNS(ssse3);
@@ -593,9 +481,97 @@ FNS(ssse3);
 #undef FNS
 #undef FN
 
-void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
                              int subpel_x_q3, int subpel_y_q3,
                              const uint8_t *ref, int ref_stride) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in
+      // build_inter_predictors() function, with some small tweaks.
+
+      // Some assumptions.
+      const int plane = 0;
+
+      // Get pre-requisites.
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int ssx = pd->subsampling_x;
+      const int ssy = pd->subsampling_y;
+      assert(ssx == 0 && ssy == 0);
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+
+      // Calculate subpel_x/y and x/y_step.
+      const int row_start = 0;  // Because ss_y is 0.
+      const int col_start = 0;  // Because ss_x is 0.
+      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+      int orig_pos_y = pre_y << SUBPEL_BITS;
+      orig_pos_y += mv->row * (1 << (1 - ssy));
+      int orig_pos_x = pre_x << SUBPEL_BITS;
+      orig_pos_x += mv->col * (1 << (1 - ssx));
+      int pos_y = sf->scale_value_y(orig_pos_y, sf);
+      int pos_x = sf->scale_value_x(orig_pos_x, sf);
+      pos_x += SCALE_EXTRA_OFF;
+      pos_y += SCALE_EXTRA_OFF;
+
+      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+                         << SCALE_SUBPEL_BITS;
+      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+                        << SCALE_SUBPEL_BITS;
+      pos_y = clamp(pos_y, top, bottom);
+      pos_x = clamp(pos_x, left, right);
+
+      const uint8_t *const pre =
+          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+          (pos_x >> SCALE_SUBPEL_BITS);
+
+      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                           pos_x & SCALE_SUBPEL_MASK,
+                                           pos_y & SCALE_SUBPEL_MASK };
+
+      // Get warp types.
+      const WarpedMotionParams *const wm =
+          &xd->global_motion[mi->ref_frame[ref_num]];
+      const int is_global = is_global_mv_block(mi, wm->wmtype);
+      WarpTypesAllowed warp_types;
+      warp_types.global_warp_allowed = is_global;
+      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+      // Get convolve parameters.
+      ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd);
+      const InterpFilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      // Get the inter predictor.
+      const int build_for_obmc = 0;
+      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+                               &subpel_params, sf, width, height, &conv_params,
+                               filters, &warp_types, mi_x >> pd->subsampling_x,
+                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
+                               build_for_obmc, xd, cm->allow_warped_motion);
+
+      return;
+    }
+  }
+
+  const InterpFilterParams filter =
+      av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
   if (!subpel_x_q3 && !subpel_y_q3) {
     if (width >= 16) {
       int i;
@@ -604,8 +580,7 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       for (i = 0; i < height; i++) {
         int j;
         for (j = 0; j < width; j += 16) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
           comp_pred += 16;
           ref += 16;
         }
@@ -617,10 +592,9 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       assert(!(height & 1));
       /*Read 8 pixels two rows at a time.*/
       for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
         comp_pred += 16;
         ref += 2 * ref_stride;
       }
@@ -630,69 +604,62 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
       assert(!(height & 3));
       /*Read 4 pixels four rows at a time.*/
       for (i = 0; i < height; i++) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride));
-        __m128i s2 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride));
-        __m128i s3 =
-            _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride));
-        __m128i t0 = _mm_unpacklo_epi32(s0, s1);
-        __m128i t1 = _mm_unpacklo_epi32(s2, s3);
-        __m128i u0 = _mm_unpacklo_epi64(t0, t1);
-        _mm_storeu_si128((__m128i *)comp_pred, u0);
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
         comp_pred += 16;
         ref += 4 * ref_stride;
       }
     }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
   } else {
-    InterpFilterParams filter;
-    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
-    if (!subpel_y_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-    } else if (!subpel_x_q3) {
-      const int16_t *kernel;
-      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-    } else {
-      DECLARE_ALIGNED(16, uint8_t,
-                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-      const int16_t *kernel_x;
-      const int16_t *kernel_y;
-      int intermediate_height;
-      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-      intermediate_height =
-          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
-      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-      aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-      aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-    }
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride,
+                        temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                        intermediate_height);
+    aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                       MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                       width, height);
   }
 }
 
-void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
-                                      int width, int height, int subpel_x_q3,
-                                      int subpel_y_q3, const uint8_t *ref,
-                                      int ref_stride) {
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride) {
   int n;
   int i;
-  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
-                     ref_stride);
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride);
   /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
   assert(!(width * height & 15));
   n = width * height >> 4;
   for (i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0));
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
     comp_pred += 16;
     pred += 16;
   }