From 277f2116b6660e9bbe7f5d67524be57eceb49b8b Mon Sep 17 00:00:00 2001
From: "Matt A. Tobin" <email@mattatobin.com>
Date: Tue, 7 Apr 2020 23:30:51 -0400
Subject: Move aom source to a sub-directory under media/libaom

There is no damned reason to treat this differently than any other media lib given its license and there never was.
---
 third_party/aom/aom_dsp/x86/aom_asm_stubs.c        |   89 -
 .../aom/aom_dsp/x86/aom_convolve_copy_sse2.asm     |  297 ---
 .../aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm  |  613 -----
 .../x86/aom_high_subpixel_bilinear_sse2.asm        |  338 ---
 .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c  | 1441 ------------
 .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c |  315 ---
 .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm       |  615 -----
 .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm      |  870 -------
 .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm |  295 ---
 .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm    |  267 ---
 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c |   34 -
 third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c  |  900 --------
 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c  | 1109 ---------
 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c |  283 ---
 third_party/aom/aom_dsp/x86/blend_mask_sse4.h      |  237 --
 third_party/aom/aom_dsp/x86/blend_sse4.h           |  191 --
 third_party/aom/aom_dsp/x86/common_avx2.h          |  147 --
 third_party/aom/aom_dsp/x86/convolve.h             |  178 --
 third_party/aom/aom_dsp/x86/convolve_avx2.h        |  199 --
 .../aom/aom_dsp/x86/convolve_common_intrin.h       |   31 -
 third_party/aom/aom_dsp/x86/convolve_sse2.h        |  121 -
 third_party/aom/aom_dsp/x86/convolve_sse4_1.h      |   53 -
 third_party/aom/aom_dsp/x86/fft_avx2.c             |   73 -
 third_party/aom/aom_dsp/x86/fft_sse2.c             |  166 --
 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h   |  344 ---
 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c        |   69 -
 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h        |  155 --
 .../aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm      |  379 ----
 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c |  998 --------
 .../aom/aom_dsp/x86/highbd_convolve_ssse3.c        |  251 --
 .../aom/aom_dsp/x86/highbd_intrapred_sse2.c        |  984 --------
 .../aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm  |  259 ---
 .../aom/aom_dsp/x86/highbd_loopfilter_avx2.c       |   66 -
 .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c       | 1697 --------------
 .../aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c  |  160 --
 .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c  |  148 --
 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm  |  296 ---
 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm    |  374 ---
 .../x86/highbd_subpel_variance_impl_sse2.asm       | 1036 ---------
 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c |  267 ---
 third_party/aom/aom_dsp/x86/highbd_variance_avx2.c |  140 --
 .../aom/aom_dsp/x86/highbd_variance_impl_sse2.asm  |  318 ---
 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c |  868 -------
 third_party/aom/aom_dsp/x86/highbd_variance_sse4.c |  216 --
 third_party/aom/aom_dsp/x86/intrapred_avx2.c       |  811 -------
 third_party/aom/aom_dsp/x86/intrapred_sse2.c       | 1430 ------------
 third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm |  625 -----
 third_party/aom/aom_dsp/x86/intrapred_ssse3.c      | 1692 --------------
 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm       |  107 -
 third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c        |  238 --
 third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c   |  192 --
 third_party/aom/aom_dsp/x86/loopfilter_sse2.c      | 2385 --------------------
 third_party/aom/aom_dsp/x86/lpf_common_sse2.h      |  215 --
 .../aom/aom_dsp/x86/masked_sad_intrin_avx2.c       |  389 ----
 .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c      |  402 ----
 .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.h      |   33 -
 .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1064 ---------
 .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.h |   92 -
 third_party/aom/aom_dsp/x86/mem_sse2.h             |   42 -
 third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h  |   58 -
 third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h |   54 -
 third_party/aom/aom_dsp/x86/obmc_sad_avx2.c        |  270 ---
 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c        |  268 ---
 third_party/aom/aom_dsp/x86/obmc_variance_avx2.c   |  190 --
 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c   |  380 ----
 .../aom/aom_dsp/x86/quantize_avx_x86_64.asm        |  435 ----
 third_party/aom/aom_dsp/x86/quantize_sse2.c        |  147 --
 .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm      |  272 ---
 third_party/aom/aom_dsp/x86/quantize_x86.h         |   77 -
 third_party/aom/aom_dsp/x86/sad4d_avx2.c           |  218 --
 third_party/aom/aom_dsp/x86/sad4d_sse2.asm         |  257 ---
 third_party/aom/aom_dsp/x86/sad_avx2.c             |  189 --
 third_party/aom/aom_dsp/x86/sad_highbd_avx2.c      | 1038 ---------
 third_party/aom/aom_dsp/x86/sad_impl_avx2.c        |  234 --
 third_party/aom/aom_dsp/x86/sad_sse2.asm           |  353 ---
 third_party/aom/aom_dsp/x86/sse_avx2.c             |  250 --
 third_party/aom/aom_dsp/x86/sse_sse4.c             |  241 --
 third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm    |  222 --
 .../aom/aom_dsp/x86/subpel_variance_sse2.asm       | 1481 ------------
 third_party/aom/aom_dsp/x86/subtract_avx2.c        |  108 -
 third_party/aom/aom_dsp/x86/subtract_sse2.asm      |  146 --
 third_party/aom/aom_dsp/x86/sum_squares_avx2.c     |   79 -
 third_party/aom/aom_dsp/x86/sum_squares_sse2.c     |  203 --
 third_party/aom/aom_dsp/x86/sum_squares_sse2.h     |   22 -
 third_party/aom/aom_dsp/x86/synonyms.h             |  114 -
 third_party/aom/aom_dsp/x86/synonyms_avx2.h        |   74 -
 third_party/aom/aom_dsp/x86/transpose_sse2.h       |  420 ----
 third_party/aom/aom_dsp/x86/txfm_common_avx2.h     |  199 --
 third_party/aom/aom_dsp/x86/txfm_common_sse2.h     |   29 -
 third_party/aom/aom_dsp/x86/variance_avx2.c        |  517 -----
 third_party/aom/aom_dsp/x86/variance_impl_avx2.c   |  517 -----
 third_party/aom/aom_dsp/x86/variance_impl_ssse3.c  |  129 --
 third_party/aom/aom_dsp/x86/variance_sse2.c        |  806 -------
 93 files changed, 37601 deletions(-)
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_asm_stubs.c
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_mask_sse4.h
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_sse4.h
 delete mode 100644 third_party/aom/aom_dsp/x86/common_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_common_intrin.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_sse4_1.h
 delete mode 100644 third_party/aom/aom_dsp/x86/fft_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/fft_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/loopfilter_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/lpf_common_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
 delete mode 100644 third_party/aom/aom_dsp/x86/mem_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_x86.h
 delete mode 100644 third_party/aom/aom_dsp/x86/sad4d_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad4d_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_impl_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/sse_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sse_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/subtract_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/subtract_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/sum_squares_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/synonyms.h
 delete mode 100644 third_party/aom/aom_dsp/x86/synonyms_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/transpose_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/txfm_common_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_impl_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_sse2.c

(limited to 'third_party/aom/aom_dsp/x86')

diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
deleted file mode 100644
index 5f5bf5f14..000000000
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-
-#if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-
-#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
-#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
-#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
-#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
-#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
-#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
-#if ARCH_X86_64
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-
-// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
-//                                      ptrdiff_t src_stride,
-//                                      uint8_t *dst,
-//                                      ptrdiff_t dst_stride,
-//                                      const int16_t *filter_x,
-//                                      int x_step_q4,
-//                                      const int16_t *filter_y,
-//                                      int y_step_q4,
-//                                      int w, int h, int bd);
-// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
-//                                     ptrdiff_t src_stride,
-//                                     uint8_t *dst,
-//                                     ptrdiff_t dst_stride,
-//                                     const int16_t *filter_x,
-//                                     int x_step_q4,
-//                                     const int16_t *filter_y,
-//                                     int y_step_q4,
-//                                     int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
-#endif  // ARCH_X86_64
-#endif  // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
deleted file mode 100644
index 7283c32b8..000000000
--- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ /dev/null
@@ -1,297 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1-2
-%ifidn %1, avg
-%define AUX_XMM_REGS 4
-%else
-%define AUX_XMM_REGS 0
-%endif
-%ifidn %2, highbd
-%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                              dst, dst_stride, \
-                                              fx, fxs, fy, fys, w, h, bd
-%else
-%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                           dst, dst_stride, \
-                                           fx, fxs, fy, fys, w, h
-%endif
-  mov r4d, dword wm
-%ifidn %2, highbd
-  shl r4d, 1
-  shl srcq, 1
-  shl src_strideq, 1
-  shl dstq, 1
-  shl dst_strideq, 1
-%else
-  cmp r4d, 4
-  je .w4
-%endif
-  cmp r4d, 8
-  je .w8
-  cmp r4d, 16
-  je .w16
-  cmp r4d, 32
-  je .w32
-
-  cmp r4d, 64
-  je .w64
-%ifidn %2, highbd
-  cmp r4d, 128
-  je .w128
-
-.w256:
-  mov                    r4d, dword hm
-.loop256:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  movu                    m0, [srcq+128]
-  movu                    m1, [srcq+128+16]
-  movu                    m2, [srcq+128+32]
-  movu                    m3, [srcq+128+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq+128]
-  pavg                    m1, [dstq+128+16]
-  pavg                    m2, [dstq+128+32]
-  pavg                    m3, [dstq+128+48]
-%endif
-  mova         [dstq+128   ], m0
-  mova         [dstq+128+16], m1
-  mova         [dstq+128+32], m2
-  mova         [dstq+128+48], m3
-  movu                    m0, [srcq+128+64]
-  movu                    m1, [srcq+128+80]
-  movu                    m2, [srcq+128+96]
-  movu                    m3, [srcq+128+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+128+64]
-  pavg                    m1, [dstq+128+80]
-  pavg                    m2, [dstq+128+96]
-  pavg                    m3, [dstq+128+112]
-%endif
-  mova         [dstq+128+64], m0
-  mova         [dstq+128+80], m1
-  mova         [dstq+128+96], m2
-  mova        [dstq+128+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop256
-  RET
-%endif
-
-.w128:
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-
-.w64:
-  mov                    r4d, dword hm
-.loop64:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop64
-  RET
-
-.w32:
-  mov                    r4d, dword hm
-.loop32:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+src_strideq]
-  movu                    m3, [srcq+src_strideq+16]
-  lea                   srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq            +16]
-  pavg                    m2, [dstq+dst_strideq]
-  pavg                    m3, [dstq+dst_strideq+16]
-%endif
-  mova [dstq               ], m0
-  mova [dstq            +16], m1
-  mova [dstq+dst_strideq   ], m2
-  mova [dstq+dst_strideq+16], m3
-  lea                   dstq, [dstq+dst_strideq*2]
-  sub                    r4d, 2
-  jnz .loop32
-  RET
-
-.w16:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop16:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+src_strideq]
-  movu                    m2, [srcq+src_strideq*2]
-  movu                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+dst_strideq]
-  pavg                    m2, [dstq+dst_strideq*2]
-  pavg                    m3, [dstq+r6q]
-%endif
-  mova  [dstq              ], m0
-  mova  [dstq+dst_strideq  ], m1
-  mova  [dstq+dst_strideq*2], m2
-  mova  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop16
-  RET
-
-.w8:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop8:
-  movh                    m0, [srcq]
-  movh                    m1, [srcq+src_strideq]
-  movh                    m2, [srcq+src_strideq*2]
-  movh                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movh                    m4, [dstq]
-  movh                    m5, [dstq+dst_strideq]
-  movh                    m6, [dstq+dst_strideq*2]
-  movh                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movh  [dstq              ], m0
-  movh  [dstq+dst_strideq  ], m1
-  movh  [dstq+dst_strideq*2], m2
-  movh  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop8
-  RET
-
-%ifnidn %2, highbd
-.w4:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop4:
-  movd                    m0, [srcq]
-  movd                    m1, [srcq+src_strideq]
-  movd                    m2, [srcq+src_strideq*2]
-  movd                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movd                    m4, [dstq]
-  movd                    m5, [dstq+dst_strideq]
-  movd                    m6, [dstq+dst_strideq*2]
-  movd                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movd  [dstq              ], m0
-  movd  [dstq+dst_strideq  ], m1
-  movd  [dstq+dst_strideq*2], m2
-  movd  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop4
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-convolve_fn copy
-convolve_fn avg
-convolve_fn copy, highbd
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
deleted file mode 100644
index b6f040791..000000000
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,613 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro HIGH_GET_FILTERS_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    psrldq      xmm7, 8
-    pshuflw     xmm4, xmm7, 0b              ;k4
-    pshuflw     xmm5, xmm7, 01010101b       ;k5
-    pshuflw     xmm6, xmm7, 10101010b       ;k6
-    pshuflw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklwd   xmm0, xmm6
-    punpcklwd   xmm2, xmm5
-    punpcklwd   xmm3, xmm4
-    punpcklwd   xmm1, xmm7
-
-    movdqa      k0k6, xmm0
-    movdqa      k2k5, xmm2
-    movdqa      k3k4, xmm3
-    movdqa      k1k7, xmm1
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6
-
-    ;Compute max and min values of a pixel
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)      ;bps
-    movq        xmm0, rdx
-    movq        xmm1, rcx
-    pshufd      xmm0, xmm0, 0b
-    movdqa      xmm2, xmm0
-    psllw       xmm0, xmm1
-    psubw       xmm0, xmm2
-    pxor        xmm1, xmm1
-    movdqa      max, xmm0                  ;max value (for clamping)
-    movdqa      min, xmm1                  ;min value (for clamping)
-
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-    punpcklwd   xmm0, xmm6                  ;two row in one register
-    punpcklwd   xmm1, xmm7
-    punpcklwd   xmm2, xmm5
-    punpcklwd   xmm3, xmm4
-
-    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
-    pmaddwd     xmm1, k1k7
-    pmaddwd     xmm2, k2k5
-    pmaddwd     xmm3, k3k4
-
-    paddd       xmm0, xmm1                  ;sum
-    paddd       xmm0, xmm2
-    paddd       xmm0, xmm3
-
-    paddd       xmm0, krd                   ;rounding
-    psrad       xmm0, 7                     ;shift
-    packssdw    xmm0, xmm0                  ;pack to word
-
-    ;clamp the values
-    pminsw      xmm0, max
-    pmaxsw      xmm0, min
-
-%if %1
-    movq        xmm1, [rdi]
-    pavgw       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0
-%endm
-
-%macro HIGH_GET_FILTERS 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    pshufhw     xmm4, xmm7, 0b              ;k4
-    pshufhw     xmm5, xmm7, 01010101b       ;k5
-    pshufhw     xmm6, xmm7, 10101010b       ;k6
-    pshufhw     xmm7, xmm7, 11111111b       ;k7
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-    punpcklwd   xmm0, xmm1
-    punpckhwd   xmm6, xmm7
-    punpckhwd   xmm2, xmm5
-    punpckhwd   xmm3, xmm4
-
-    movdqa      k0k1, xmm0                  ;store filter factors on stack
-    movdqa      k6k7, xmm6
-    movdqa      k2k5, xmm2
-    movdqa      k3k4, xmm3
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6                   ;rounding
-
-    ;Compute max and min values of a pixel
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm0, rdx
-    movq        xmm1, rcx
-    pshufd      xmm0, xmm0, 0b
-    movdqa      xmm2, xmm0
-    psllw       xmm0, xmm1
-    psubw       xmm0, xmm2
-    pxor        xmm1, xmm1
-    movdqa      max, xmm0                  ;max value (for clamping)
-    movdqa      min, xmm1                  ;min value (for clamping)
-%endm
-
-%macro LOAD_VERT_8 1
-    movdqu      xmm0, [rsi + %1]            ;0
-    movdqu      xmm1, [rsi + rax + %1]      ;1
-    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
-    lea         rsi,  [rsi + rax]
-    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
-    movdqu      xmm2, [rsi + rax + %1]      ;2
-    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
-    movdqu      xmm4, [rsi + rdx + %1]      ;4
-    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
-%endm
-
-%macro HIGH_APPLY_FILTER_8 2
-    movdqu      temp, xmm4
-    movdqa      xmm4, xmm0
-    punpcklwd   xmm0, xmm1
-    punpckhwd   xmm4, xmm1
-    movdqa      xmm1, xmm6
-    punpcklwd   xmm6, xmm7
-    punpckhwd   xmm1, xmm7
-    movdqa      xmm7, xmm2
-    punpcklwd   xmm2, xmm5
-    punpckhwd   xmm7, xmm5
-
-    movdqu      xmm5, temp
-    movdqu      temp, xmm4
-    movdqa      xmm4, xmm3
-    punpcklwd   xmm3, xmm5
-    punpckhwd   xmm4, xmm5
-    movdqu      xmm5, temp
-
-    pmaddwd     xmm0, k0k1
-    pmaddwd     xmm5, k0k1
-    pmaddwd     xmm6, k6k7
-    pmaddwd     xmm1, k6k7
-    pmaddwd     xmm2, k2k5
-    pmaddwd     xmm7, k2k5
-    pmaddwd     xmm3, k3k4
-    pmaddwd     xmm4, k3k4
-
-    paddd       xmm0, xmm6
-    paddd       xmm0, xmm2
-    paddd       xmm0, xmm3
-    paddd       xmm5, xmm1
-    paddd       xmm5, xmm7
-    paddd       xmm5, xmm4
-
-    paddd       xmm0, krd                   ;rounding
-    paddd       xmm5, krd
-    psrad       xmm0, 7                     ;shift
-    psrad       xmm5, 7
-    packssdw    xmm0, xmm5                  ;pack back to word
-
-    ;clamp the values
-    pminsw      xmm0, max
-    pmaxsw      xmm0, min
-
-%if %1
-    movdqu      xmm1, [rdi + %2]
-    pavgw       xmm0, xmm1
-%endif
-    movdqu      [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movq        xmm0, [rsi]                 ;load src: row 0
-    movq        xmm1, [rsi + rax]           ;1
-    movq        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2]       ;7
-    movq        xmm2, [rsi + rax]           ;2
-    movq        xmm3, [rsi + rax * 2]       ;3
-    movq        xmm4, [rsi + rdx]           ;4
-    movq        xmm5, [rsi + rax * 4]       ;5
-
-    HIGH_APPLY_FILTER_4 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 0, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 0, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 16
-    HIGH_APPLY_FILTER_8 0, 16
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm4,   [rsi + 2]
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm4
-    movdqa      xmm7, xmm4
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm4
-
-    psrldq      xmm1, 2
-    psrldq      xmm6, 4
-    psrldq      xmm7, 6
-    psrldq      xmm2, 4
-    psrldq      xmm3, 6
-    psrldq      xmm5, 2
-
-    HIGH_APPLY_FILTER_4 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 0, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 0, 0
-
-    movdqu      xmm0,   [rsi + 10]           ;load src
-    movdqu      xmm1,   [rsi + 12]
-    movdqu      xmm2,   [rsi + 14]
-    movdqu      xmm3,   [rsi + 16]
-    movdqu      xmm4,   [rsi + 18]
-    movdqu      xmm5,   [rsi + 20]
-    movdqu      xmm6,   [rsi + 22]
-    movdqu      xmm7,   [rsi + 24]
-
-    HIGH_APPLY_FILTER_8 0, 16
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
deleted file mode 100644
index 7b3fe6419..000000000
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,338 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro HIGH_GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklwd   xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm5, rdx
-    movq        xmm2, rcx
-    pshufd      xmm5, xmm5, 0b
-    movdqa      xmm1, xmm5
-    psllw       xmm5, xmm2
-    psubw       xmm5, xmm1                  ;max value (for clamping)
-    pxor        xmm2, xmm2                  ;min value (for clamping)
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-
-    punpcklwd   xmm0, xmm1                  ;two row in one register
-    pmaddwd     xmm0, xmm4                  ;multiply the filter factors
-
-    paddd       xmm0, xmm3                  ;rounding
-    psrad       xmm0, 7                     ;shift
-    packssdw    xmm0, xmm0                  ;pack to word
-
-    ;clamp the values
-    pminsw      xmm0, xmm5
-    pmaxsw      xmm0, xmm2
-
-%if %1
-    movq        xmm1, [rdi]
-    pavgw       xmm0, xmm1
-%endif
-
-    movq        [rdi], xmm0
-    lea         rsi, [rsi + 2*rax]
-    lea         rdi, [rdi + 2*rdx]
-    dec         rcx
-%endm
-
-%if ARCH_X86_64
-%macro HIGH_GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm6, [rdx]                 ;load filters
-
-    pshuflw     xmm7, xmm6, 11111111b       ;k3
-    pshufhw     xmm6, xmm6, 0b              ;k4
-    psrldq      xmm6, 8
-    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm8, rdx
-    movq        xmm5, rcx
-    pshufd      xmm8, xmm8, 0b
-    movdqa      xmm1, xmm8
-    psllw       xmm8, xmm5
-    psubw       xmm8, xmm1                  ;max value (for clamping)
-    pxor        xmm5, xmm5                  ;min value (for clamping)
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_8 1
-    movdqa      xmm6, xmm0
-    punpckhwd   xmm6, xmm1
-    punpcklwd   xmm0, xmm1
-    pmaddwd     xmm6, xmm7
-    pmaddwd     xmm0, xmm7
-
-    paddd       xmm6, xmm4                  ;rounding
-    paddd       xmm0, xmm4                  ;rounding
-    psrad       xmm6, 7                     ;shift
-    psrad       xmm0, 7                     ;shift
-    packssdw    xmm0, xmm6                  ;pack back to word
-
-    ;clamp the values
-    pminsw      xmm0, xmm8
-    pmaxsw      xmm0, xmm5
-
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgw       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + 2*rax]
-    lea         rdi, [rdi + 2*rdx]
-    dec         rcx
-%endm
-
-%macro HIGH_APPLY_FILTER_16 1
-    movdqa      xmm9, xmm0
-    movdqa      xmm6, xmm2
-    punpckhwd   xmm9, xmm1
-    punpckhwd   xmm6, xmm3
-    punpcklwd   xmm0, xmm1
-    punpcklwd   xmm2, xmm3
-
-    pmaddwd     xmm9, xmm7
-    pmaddwd     xmm6, xmm7
-    pmaddwd     xmm0, xmm7
-    pmaddwd     xmm2, xmm7
-
-    paddd       xmm9, xmm4                  ;rounding
-    paddd       xmm6, xmm4
-    paddd       xmm0, xmm4
-    paddd       xmm2, xmm4
-
-    psrad       xmm9, 7                     ;shift
-    psrad       xmm6, 7
-    psrad       xmm0, 7
-    psrad       xmm2, 7
-
-    packssdw    xmm0, xmm9                  ;pack back to word
-    packssdw    xmm2, xmm6                  ;pack back to word
-
-    ;clamp the values
-    pminsw      xmm0, xmm8
-    pmaxsw      xmm0, xmm5
-    pminsw      xmm2, xmm8
-    pmaxsw      xmm2, xmm5
-
-%if %1
-    movdqu      xmm1, [rdi]
-    movdqu      xmm3, [rdi + 16]
-    pavgw       xmm0, xmm1
-    pavgw       xmm2, xmm3
-%endif
-    movdqu      [rdi], xmm0               ;store the result
-    movdqu      [rdi + 16], xmm2          ;store the result
-
-    lea         rsi, [rsi + 2*rax]
-    lea         rdi, [rdi + 2*rdx]
-    dec         rcx
-%endm
-%endif
-
-SECTION .text
-
-global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movq        xmm0, [rsi]                 ;load src
-    movq        xmm1, [rsi + 2*rax]
-
-    HIGH_APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + 2*rax]         ;1
-
-    HIGH_APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm2, [rsi + 16]
-    movdqu        xmm1, [rsi + 2*rax]       ;1
-    movdqu        xmm3, [rsi + 2*rax + 16]
-
-    HIGH_APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
-
-global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 2
-
-    HIGH_APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 2]
-
-    HIGH_APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 2]
-    movdqu      xmm2,   [rsi + 16]
-    movdqu      xmm3,   [rsi + 18]
-
-    HIGH_APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
deleted file mode 100644
index 94b5da171..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ /dev/null
@@ -1,1441 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_ports/mem.h"
-
-#if defined(__clang__)
-#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
-    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
-    (defined(__APPLE__) && defined(__apple_build_version__) && \
-     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
-      (__clang_major__ == 5 && __clang_minor__ == 0)))
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#else  // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // clang <= 3.3
-#elif defined(__GNUC__)
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-#else  // gcc > 4.7
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // gcc <= 4.6
-#else   // !(gcc || clang)
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // __clang__
-
-static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
-                                    const ptrdiff_t stride, const __m256i *a) {
-  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
-  *((uint32_t *)(output_ptr + stride)) =
-      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
-  return a;
-}
-
-static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
-                                    const ptrdiff_t stride, const __m256i *a) {
-  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
-  _mm_storel_epi64((__m128i *)(output_ptr + stride),
-                   _mm256_extractf128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
-  return a;
-}
-
-static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
-                                   const ptrdiff_t stride, const __m256i *a) {
-  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
-  _mm_store_si128((__m128i *)(output_ptr + stride),
-                  _mm256_extractf128_si256(*a, 1));
-}
-
-static void aom_filter_block1d4_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  firstFilters =
-      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    srcRegFilt32b1_1 =
-        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 4 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
-    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 4 bytes
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d4_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg;
-  __m256i firstFilters, secondFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
-  __m256i srcReg32b1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 32 bits
-  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
-  // duplicate only the second 32 bits
-  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    // filter the source buffer
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    srcRegFilt32b1_1 =
-        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 4 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
-    // filter the source buffer
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 4 bytes
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt2Reg, filt3Reg;
-  __m256i secondFilters, thirdFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 8 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 8 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d16_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt2Reg, filt3Reg;
-  __m256i secondFilters, thirdFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    __m256i srcReg1, srcReg12;
-    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
-
-    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
-    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
-
-    // filter the source buffer
-    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
-    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
-    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
-    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
-    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt1_1));
-  }
-}
-
-static void aom_filter_block1d16_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
-    // filter the source buffer
-    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // reading the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    // filter the source buffer
-    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2_1 =
-        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg45_56_lo;
-  __m256i resReg23_34_lo, resReg45_56_lo;
-  __m256i resReglo, resReg;
-  __m256i secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
-    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReglo);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4x = srcReg6x;
-  }
-}
-
-static void aom_filter_block1d8_v8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each consecutive loads on the same 256 register
-  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
-  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
-  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
-
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // shift by 6 bit each 16 bit
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b7 = srcReg32b9;
-  }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
-  }
-}
-
-static void aom_filter_block1d16_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
-  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
-  __m256i resReglo, resReghi, resReg;
-  __m256i secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
-    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
-    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
-
-    // add and saturate the results together
-    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-    resReghi = _mm256_srai_epi16(resReghi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReghi);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg23_34_hi = srcReg45_56_hi;
-    srcReg4x = srcReg6x;
-  }
-}
-
-static void aom_filter_block1d16_v8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each consecutive loads on the same 256 register
-  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
-  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
-  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
-  // save
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
-
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // shift by 6 bit each 16 bit
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b1 = srcReg32b3;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b3 = srcReg32b5;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b5 = srcReg32b7;
-    srcReg32b7 = srcReg32b9;
-  }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-    srcRegFilt7 =
-        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt7 =
-        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
-                                    _mm256_castsi256_si128(secondFilters));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
-                                    _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
-  }
-}
-
-static void aom_filter_block1d4_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg45_56_lo;
-  __m256i srcReg2345_3456_lo;
-  __m256i resReglo, resReg;
-  __m256i firstFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  firstFilters =
-      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
-    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
-
-    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReglo);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4x = srcReg6x;
-  }
-}
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
-#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
-#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
-#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
-#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
-#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
-#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
-// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 325a21b76..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-
-void aom_filter_block1d4_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter into the first lane
-  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-  // duplicate only the third 16 bit in the filter into the first lane
-  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-  // duplicate only the seconds 16 bits in the filter into the second lane
-  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
-  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-  // duplicate only the forth 16 bits in the filter into the second lane
-  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
-  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-  // loading the local filters
-  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
-  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-    src_ptr += src_pixels_per_line;
-
-    // save only 4 bytes
-    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
-    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-    // add and saturate all the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pixels_per_line;
-
-    // save only 8 bytes
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  // load the first 7 rows of 8 bytes
-  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
-  for (i = 0; i < output_height; i++) {
-    // load the last 8 bytes
-    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
-    // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-    // add and saturate the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pitch;
-
-    // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
-
-    // save only 8 bytes convolve result
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += out_pitch;
-  }
-}
-
-filter8_1dfunction aom_filter_block1d16_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-
-#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
-#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
-#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
-#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
-#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
-#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
-
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-
-// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
deleted file mode 100644
index c88fc9ffb..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,615 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro GET_FILTERS_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    psrldq      xmm7, 8
-    pshuflw     xmm4, xmm7, 0b              ;k4
-    pshuflw     xmm5, xmm7, 01010101b       ;k5
-    pshuflw     xmm6, xmm7, 10101010b       ;k6
-    pshuflw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklqdq  xmm0, xmm1
-    punpcklqdq  xmm2, xmm3
-    punpcklqdq  xmm5, xmm4
-    punpcklqdq  xmm6, xmm7
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm2
-    movdqa      k5k4, xmm5
-    movdqa      k6k7, xmm6
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6
-
-    pxor        xmm7, xmm7
-    movdqa      zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
-    punpckldq   xmm0, xmm1                  ;two row in one register
-    punpckldq   xmm6, xmm7
-    punpckldq   xmm2, xmm3
-    punpckldq   xmm5, xmm4
-
-    punpcklbw   xmm0, zero                  ;unpack to word
-    punpcklbw   xmm6, zero
-    punpcklbw   xmm2, zero
-    punpcklbw   xmm5, zero
-
-    pmullw      xmm0, k0k1                  ;multiply the filter factors
-    pmullw      xmm6, k6k7
-    pmullw      xmm2, k2k3
-    pmullw      xmm5, k5k4
-
-    paddsw      xmm0, xmm6                  ;sum
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm2
-    psrldq      xmm2, 8
-    paddsw      xmm0, xmm5
-    psrldq      xmm5, 8
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-
-    paddsw      xmm0, krd                   ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movd        [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    pshufhw     xmm4, xmm7, 0b              ;k4
-    pshufhw     xmm5, xmm7, 01010101b       ;k5
-    pshufhw     xmm6, xmm7, 10101010b       ;k6
-    pshufhw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklwd   xmm0, xmm0
-    punpcklwd   xmm1, xmm1
-    punpcklwd   xmm2, xmm2
-    punpcklwd   xmm3, xmm3
-    punpckhwd   xmm4, xmm4
-    punpckhwd   xmm5, xmm5
-    punpckhwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movdqa      k0,   xmm0                  ;store filter factors on stack
-    movdqa      k1,   xmm1
-    movdqa      k2,   xmm2
-    movdqa      k3,   xmm3
-    movdqa      k4,   xmm4
-    movdqa      k5,   xmm5
-    movdqa      k6,   xmm6
-    movdqa      k7,   xmm7
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6                   ;rounding
-
-    pxor        xmm7, xmm7
-    movdqa      zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
-    movq        xmm0, [rsi + %1]            ;0
-    movq        xmm1, [rsi + rax + %1]      ;1
-    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
-    movq        xmm2, [rsi + rax + %1]      ;2
-    movq        xmm3, [rsi + rax * 2 + %1]  ;3
-    movq        xmm4, [rsi + rdx + %1]      ;4
-    movq        xmm5, [rsi + rax * 4 + %1]  ;5
-%endm
-
-%macro APPLY_FILTER_8 2
-    punpcklbw   xmm0, zero
-    punpcklbw   xmm1, zero
-    punpcklbw   xmm6, zero
-    punpcklbw   xmm7, zero
-    punpcklbw   xmm2, zero
-    punpcklbw   xmm5, zero
-    punpcklbw   xmm3, zero
-    punpcklbw   xmm4, zero
-
-    pmullw      xmm0, k0
-    pmullw      xmm1, k1
-    pmullw      xmm6, k6
-    pmullw      xmm7, k7
-    pmullw      xmm2, k2
-    pmullw      xmm5, k5
-    pmullw      xmm3, k3
-    pmullw      xmm4, k4
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm6
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-    paddsw      xmm0, xmm3
-    paddsw      xmm0, xmm4
-
-    paddsw      xmm0, krd                   ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi + %2]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d4_v8_sse2) PRIVATE
-sym(aom_filter_block1d4_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movd        xmm0, [rsi]                 ;load src: row 0
-    movd        xmm1, [rsi + rax]           ;1
-    movd        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movd        xmm7, [rsi + rdx * 2]       ;7
-    movd        xmm2, [rsi + rax]           ;2
-    movd        xmm3, [rsi + rax * 2]       ;3
-    movd        xmm4, [rsi + rdx]           ;4
-    movd        xmm5, [rsi + rax * 4]       ;5
-
-    APPLY_FILTER_4 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d8_v8_sse2) PRIVATE
-sym(aom_filter_block1d8_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 0, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d16_v8_sse2) PRIVATE
-sym(aom_filter_block1d16_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 0, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 8
-    APPLY_FILTER_8 0, 8
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d4_h8_sse2) PRIVATE
-sym(aom_filter_block1d4_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm3, 3
-    psrldq      xmm5, 5
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_4 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d8_h8_sse2) PRIVATE
-sym(aom_filter_block1d8_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d16_h8_sse2) PRIVATE
-sym(aom_filter_block1d16_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 0
-
-    movdqu      xmm0,   [rsi + 5]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 8
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
deleted file mode 100644
index 3ca7921b6..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,870 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_64:    times 8 dw 64
-even_byte_mask: times 8 dw 0x00ff
-
-; %define USE_PMULHRSW
-; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
-; when using this instruction.
-;
-; The add order below (based on ffav1) must be followed to prevent outranges.
-; x = k0k1 + k4k5
-; y = k2k3 + k6k7
-; z = signed SAT(x + y)
-
-SECTION .text
-%define LOCAL_VARS_SIZE 16*6
-
-%macro SETUP_LOCAL_VARS 0
-    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
-    ; pmaddubsw has a higher latency on some platforms, this might be eased by
-    ; interleaving the instructions.
-    %define    k0k1  [rsp + 16*0]
-    %define    k2k3  [rsp + 16*1]
-    %define    k4k5  [rsp + 16*2]
-    %define    k6k7  [rsp + 16*3]
-    packsswb     m4, m4
-    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
-    ; some platforms.
-    pshuflw      m0, m4, 0b              ;k0_k1
-    pshuflw      m1, m4, 01010101b       ;k2_k3
-    pshuflw      m2, m4, 10101010b       ;k4_k5
-    pshuflw      m3, m4, 11111111b       ;k6_k7
-    punpcklqdq   m0, m0
-    punpcklqdq   m1, m1
-    punpcklqdq   m2, m2
-    punpcklqdq   m3, m3
-    mova       k0k1, m0
-    mova       k2k3, m1
-    mova       k4k5, m2
-    mova       k6k7, m3
-%if ARCH_X86_64
-    %define     krd  m12
-    %define    tmp0  [rsp + 16*4]
-    %define    tmp1  [rsp + 16*5]
-    mova        krd, [GLOBAL(pw_64)]
-%else
-    %define     krd  [rsp + 16*4]
-%if CONFIG_PIC=0
-    mova         m6, [GLOBAL(pw_64)]
-%else
-    ; build constants without accessing global memory
-    pcmpeqb      m6, m6                  ;all ones
-    psrlw        m6, 15
-    psllw        m6, 6                   ;aka pw_64
-%endif
-    mova        krd, m6
-%endif
-%endm
-
-;-------------------------------------------------------------------------------
-%if ARCH_X86_64
-  %define LOCAL_VARS_SIZE_H4 0
-%else
-  %define LOCAL_VARS_SIZE_H4 16*4
-%endif
-
-%macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
-                            src, sstride, dst, dstride, height, filter
-    mova                m4, [filterq]
-    packsswb            m4, m4
-%if ARCH_X86_64
-    %define       k0k1k4k5  m8
-    %define       k2k3k6k7  m9
-    %define            krd  m10
-    mova               krd, [GLOBAL(pw_64)]
-    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
-    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
-    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
-    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
-%else
-    %define       k0k1k4k5  [rsp + 16*0]
-    %define       k2k3k6k7  [rsp + 16*1]
-    %define            krd  [rsp + 16*2]
-    pshuflw             m6, m4, 0b              ;k0_k1
-    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
-    pshuflw             m7, m4, 01010101b       ;k2_k3
-    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
-%if CONFIG_PIC=0
-    mova                m1, [GLOBAL(pw_64)]
-%else
-    ; build constants without accessing global memory
-    pcmpeqb             m1, m1                  ;all ones
-    psrlw               m1, 15
-    psllw               m1, 6                   ;aka pw_64
-%endif
-    mova          k0k1k4k5, m6
-    mova          k2k3k6k7, m7
-    mova               krd, m1
-%endif
-    dec            heightd
-
-.loop:
-    ;Do two rows at once
-    movu                m4, [srcq - 3]
-    movu                m5, [srcq + sstrideq - 3]
-    punpckhbw           m1, m4, m4
-    punpcklbw           m4, m4
-    punpckhbw           m3, m5, m5
-    punpcklbw           m5, m5
-    palignr             m0, m1, m4, 1
-    pmaddubsw           m0, k0k1k4k5
-    palignr             m1, m4, 5
-    pmaddubsw           m1, k2k3k6k7
-    palignr             m2, m3, m5, 1
-    pmaddubsw           m2, k0k1k4k5
-    palignr             m3, m5, 5
-    pmaddubsw           m3, k2k3k6k7
-    punpckhqdq          m4, m0, m2
-    punpcklqdq          m0, m2
-    punpckhqdq          m5, m1, m3
-    punpcklqdq          m1, m3
-    paddsw              m0, m4
-    paddsw              m1, m5
-%ifidn %1, h8_avg
-    movd                m4, [dstq]
-    movd                m5, [dstq + dstrideq]
-%endif
-    paddsw              m0, m1
-    paddsw              m0, krd
-    psraw               m0, 7
-%ifidn %1, h8_add_src
-    pxor                 m3, m3
-    movu                 m4, [srcq]
-    movu                 m5, [srcq + sstrideq]
-    punpckldq            m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2
-    punpcklbw            m4, m3
-    paddsw               m0, m4
-%endif
-    packuswb            m0, m0
-    psrldq              m1, m0, 4
-
-%ifidn %1, h8_avg
-    pavgb               m0, m4
-    pavgb               m1, m5
-%endif
-    movd            [dstq], m0
-    movd [dstq + dstrideq], m1
-
-    lea               srcq, [srcq + sstrideq        ]
-    prefetcht0              [srcq + 4 * sstrideq - 3]
-    lea               srcq, [srcq + sstrideq        ]
-    lea               dstq, [dstq + 2 * dstrideq    ]
-    prefetcht0              [srcq + 2 * sstrideq - 3]
-
-    sub            heightd, 2
-    jg               .loop
-
-    ; Do last row if output_height is odd
-    jne              .done
-
-    movu                m4, [srcq - 3]
-    punpckhbw           m1, m4, m4
-    punpcklbw           m4, m4
-    palignr             m0, m1, m4, 1
-    palignr             m1, m4, 5
-    pmaddubsw           m0, k0k1k4k5
-    pmaddubsw           m1, k2k3k6k7
-    psrldq              m2, m0, 8
-    psrldq              m3, m1, 8
-    paddsw              m0, m2
-    paddsw              m1, m3
-    paddsw              m0, m1
-    paddsw              m0, krd
-    psraw               m0, 7
-%ifidn %1, h8_add_src
-    pxor                m3, m3
-    movu                m4, [srcq]
-    punpcklbw           m4, m3
-    paddsw              m0, m4
-%endif
-    packuswb            m0, m0
-%ifidn %1, h8_avg
-    movd                m4, [dstq]
-    pavgb               m0, m4
-%endif
-    movd            [dstq], m0
-.done:
-    REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
-                            src, sstride, dst, dstride, height, filter
-    mova                 m4, [filterq]
-    SETUP_LOCAL_VARS
-    dec             heightd
-
-.loop:
-    ;Do two rows at once
-    movu                 m0, [srcq - 3]
-    movu                 m4, [srcq + sstrideq - 3]
-    punpckhbw            m1, m0, m0
-    punpcklbw            m0, m0
-    palignr              m5, m1, m0, 13
-    pmaddubsw            m5, k6k7
-    palignr              m2, m1, m0, 5
-    palignr              m3, m1, m0, 9
-    palignr              m1, m0, 1
-    pmaddubsw            m1, k0k1
-    punpckhbw            m6, m4, m4
-    punpcklbw            m4, m4
-    pmaddubsw            m2, k2k3
-    pmaddubsw            m3, k4k5
-
-    palignr              m7, m6, m4, 13
-    palignr              m0, m6, m4, 5
-    pmaddubsw            m7, k6k7
-    paddsw               m1, m3
-    paddsw               m2, m5
-    paddsw               m1, m2
-%ifidn %1, h8_avg
-    movh                 m2, [dstq]
-    movhps               m2, [dstq + dstrideq]
-%endif
-    palignr              m5, m6, m4, 9
-    palignr              m6, m4, 1
-    pmaddubsw            m0, k2k3
-    pmaddubsw            m6, k0k1
-    paddsw               m1, krd
-    pmaddubsw            m5, k4k5
-    psraw                m1, 7
-    paddsw               m0, m7
-    paddsw               m6, m5
-    paddsw               m6, m0
-    paddsw               m6, krd
-    psraw                m6, 7
-%ifidn %1, h8_add_src
-    pxor                 m3, m3
-    movu                 m4, [srcq]
-    movu                 m5, [srcq + sstrideq]
-    punpcklbw            m4, m3
-    punpcklbw            m5, m3
-    paddsw               m1, m4
-    paddsw               m6, m5
-%endif
-    packuswb             m1, m6
-%ifidn %1, h8_avg
-    pavgb                m1, m2
-%endif
-    movh              [dstq], m1
-    movhps [dstq + dstrideq], m1
-
-    lea                srcq, [srcq + sstrideq        ]
-    prefetcht0               [srcq + 4 * sstrideq - 3]
-    lea                srcq, [srcq + sstrideq        ]
-    lea                dstq, [dstq + 2 * dstrideq    ]
-    prefetcht0               [srcq + 2 * sstrideq - 3]
-    sub             heightd, 2
-    jg                .loop
-
-    ; Do last row if output_height is odd
-    jne               .done
-
-    movu                 m0, [srcq - 3]
-    punpckhbw            m3, m0, m0
-    punpcklbw            m0, m0
-    palignr              m1, m3, m0, 1
-    palignr              m2, m3, m0, 5
-    palignr              m4, m3, m0, 13
-    palignr              m3, m0, 9
-    pmaddubsw            m1, k0k1
-    pmaddubsw            m2, k2k3
-    pmaddubsw            m3, k4k5
-    pmaddubsw            m4, k6k7
-    paddsw               m1, m3
-    paddsw               m4, m2
-    paddsw               m1, m4
-    paddsw               m1, krd
-    psraw                m1, 7
-%ifidn %1, h8_add_src
-    pxor                 m6, m6
-    movu                 m5, [srcq]
-    punpcklbw            m5, m6
-    paddsw               m1, m5
-%endif
-    packuswb             m1, m1
-%ifidn %1, h8_avg
-    movh                 m0, [dstq]
-    pavgb                m1, m0
-%endif
-    movh             [dstq], m1
-.done:
-    REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
-                             src, sstride, dst, dstride, height, filter
-    mova          m4, [filterq]
-    SETUP_LOCAL_VARS
-
-.loop:
-    prefetcht0        [srcq + 2 * sstrideq -3]
-
-    movu          m0, [srcq - 3]
-    movu          m4, [srcq - 2]
-    pmaddubsw     m0, k0k1
-    pmaddubsw     m4, k0k1
-    movu          m1, [srcq - 1]
-    movu          m5, [srcq + 0]
-    pmaddubsw     m1, k2k3
-    pmaddubsw     m5, k2k3
-    movu          m2, [srcq + 1]
-    movu          m6, [srcq + 2]
-    pmaddubsw     m2, k4k5
-    pmaddubsw     m6, k4k5
-    movu          m3, [srcq + 3]
-    movu          m7, [srcq + 4]
-    pmaddubsw     m3, k6k7
-    pmaddubsw     m7, k6k7
-    paddsw        m0, m2
-    paddsw        m1, m3
-    paddsw        m0, m1
-    paddsw        m4, m6
-    paddsw        m5, m7
-    paddsw        m4, m5
-    paddsw        m0, krd
-    paddsw        m4, krd
-    psraw         m0, 7
-    psraw         m4, 7
-%ifidn %1, h8_add_src
-%if ARCH_X86=1 && CONFIG_PIC=1
-    pcmpeqb       m2, m2                  ;all ones
-    psrlw         m2, 8                   ;even_byte_mask
-%else
-    mova          m2, [GLOBAL(even_byte_mask)]
-%endif
-    movu          m5, [srcq]
-    mova          m7, m5
-    pand          m5, m2
-    psrlw         m7, 8
-    paddsw        m0, m5
-    paddsw        m4, m7
-%endif
-    packuswb      m0, m0
-    packuswb      m4, m4
-    punpcklbw     m0, m4
-%ifidn %1, h8_avg
-    pavgb         m0, [dstq]
-%endif
-    lea         srcq, [srcq + sstrideq]
-    mova      [dstq], m0
-    lea         dstq, [dstq + dstrideq]
-    dec      heightd
-    jnz        .loop
-    REP_RET
-%endm
-
-INIT_XMM ssse3
-SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER8  h8
-SUBPIX_HFILTER4  h8
-
-;-------------------------------------------------------------------------------
-
-; TODO(Linfeng): Detect cpu type and choose the code with better performance.
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
-
-%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-    %define NUM_GENERAL_REG_USED 9
-%else
-    %define NUM_GENERAL_REG_USED 6
-%endif
-
-%macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
-                             src, sstride, dst, dstride, height, filter
-    mova          m4, [filterq]
-    SETUP_LOCAL_VARS
-
-%ifidn %2, 8
-    %define                movx  movh
-%else
-    %define                movx  movd
-%endif
-
-    dec                 heightd
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
-    %define               src1q  r7
-    %define           sstride6q  r8
-    %define          dst_stride  dstrideq
-%else
-    %define               src1q  filterq
-    %define           sstride6q  dstrideq
-    %define          dst_stride  dstridemp
-%endif
-    mov                   src1q, srcq
-    add                   src1q, sstrideq
-    lea               sstride6q, [sstrideq + sstrideq * 4]
-    add               sstride6q, sstrideq                   ;pitch * 6
-
-.loop:
-    ;Do two rows at once
-    movx                     m0, [srcq                ]     ;A
-    movx                     m1, [src1q               ]     ;B
-    punpcklbw                m0, m1                         ;A B
-    movx                     m2, [srcq + sstrideq * 2 ]     ;C
-    pmaddubsw                m0, k0k1
-    mova                     m6, m2
-    movx                     m3, [src1q + sstrideq * 2]     ;D
-    punpcklbw                m2, m3                         ;C D
-    pmaddubsw                m2, k2k3
-    movx                     m4, [srcq + sstrideq * 4 ]     ;E
-    mova                     m7, m4
-    movx                     m5, [src1q + sstrideq * 4]     ;F
-    punpcklbw                m4, m5                         ;E F
-    pmaddubsw                m4, k4k5
-    punpcklbw                m1, m6                         ;A B next iter
-    movx                     m6, [srcq + sstride6q    ]     ;G
-    punpcklbw                m5, m6                         ;E F next iter
-    punpcklbw                m3, m7                         ;C D next iter
-    pmaddubsw                m5, k4k5
-    movx                     m7, [src1q + sstride6q   ]     ;H
-    punpcklbw                m6, m7                         ;G H
-    pmaddubsw                m6, k6k7
-    pmaddubsw                m3, k2k3
-    pmaddubsw                m1, k0k1
-    paddsw                   m0, m4
-    paddsw                   m2, m6
-    movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter
-    punpcklbw                m7, m6
-    pmaddubsw                m7, k6k7
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    psraw                    m0, 7
-    paddsw                   m1, m5
-%ifidn %1, v8_add_src
-    pxor                     m6, m6
-    movu                     m4, [srcq]
-    punpcklbw                m4, m6
-    paddsw                   m0, m4
-%endif
-    packuswb                 m0, m0
-
-    paddsw                   m3, m7
-    paddsw                   m1, m3
-    paddsw                   m1, krd
-    psraw                    m1, 7
-%ifidn %1, v8_add_src
-    movu                     m4, [src1q]
-    punpcklbw                m4, m6
-    paddsw                   m1, m4
-%endif
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    lea                   src1q, [src1q + sstrideq * 2]
-    packuswb                 m1, m1
-
-%ifidn %1, v8_avg
-    movx                     m2, [dstq]
-    pavgb                    m0, m2
-%endif
-    movx                 [dstq], m0
-    add                    dstq, dst_stride
-%ifidn %1, v8_avg
-    movx                     m3, [dstq]
-    pavgb                    m1, m3
-%endif
-    movx                 [dstq], m1
-    add                    dstq, dst_stride
-    sub                 heightd, 2
-    jg                    .loop
-
-    ; Do last row if output_height is odd
-    jne                   .done
-
-    movx                     m0, [srcq                ]     ;A
-    movx                     m1, [srcq + sstrideq     ]     ;B
-    movx                     m6, [srcq + sstride6q    ]     ;G
-    punpcklbw                m0, m1                         ;A B
-    movx                     m7, [src1q + sstride6q   ]     ;H
-    pmaddubsw                m0, k0k1
-    movx                     m2, [srcq + sstrideq * 2 ]     ;C
-    punpcklbw                m6, m7                         ;G H
-    movx                     m3, [src1q + sstrideq * 2]     ;D
-    pmaddubsw                m6, k6k7
-    movx                     m4, [srcq + sstrideq * 4 ]     ;E
-    punpcklbw                m2, m3                         ;C D
-    movx                     m5, [src1q + sstrideq * 4]     ;F
-    punpcklbw                m4, m5                         ;E F
-    pmaddubsw                m2, k2k3
-    pmaddubsw                m4, k4k5
-    paddsw                   m2, m6
-    paddsw                   m0, m4
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    psraw                    m0, 7
-%ifidn %1, v8_add_src
-    pxor                     m6, m6
-    movu                     m4, [srcq]
-    punpcklbw                m4, m6
-    paddsw                   m0, m4
-%endif
-    packuswb                 m0, m0
-%ifidn %1, v8_avg
-    movx                     m1, [dstq]
-    pavgb                    m0, m1
-%endif
-    movx                 [dstq], m0
-
-%else
-    ; ARCH_X86_64
-
-    movx                     m0, [srcq                ]     ;A
-    movx                     m1, [srcq + sstrideq     ]     ;B
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                     m2, [srcq]                     ;C
-    movx                     m3, [srcq + sstrideq]          ;D
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                     m4, [srcq]                     ;E
-    movx                     m5, [srcq + sstrideq]          ;F
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                     m6, [srcq]                     ;G
-    punpcklbw                m0, m1                         ;A B
-    punpcklbw                m1, m2                         ;A B next iter
-    punpcklbw                m2, m3                         ;C D
-    punpcklbw                m3, m4                         ;C D next iter
-    punpcklbw                m4, m5                         ;E F
-    punpcklbw                m5, m6                         ;E F next iter
-
-.loop:
-    ;Do two rows at once
-    movx                     m7, [srcq + sstrideq]          ;H
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                    m14, [srcq]                     ;H next iter
-    punpcklbw                m6, m7                         ;G H
-    punpcklbw                m7, m14                        ;G H next iter
-    pmaddubsw                m8, m0, k0k1
-    pmaddubsw                m9, m1, k0k1
-    mova                     m0, m2
-    mova                     m1, m3
-    pmaddubsw               m10, m2, k2k3
-    pmaddubsw               m11, m3, k2k3
-    mova                     m2, m4
-    mova                     m3, m5
-    pmaddubsw                m4, k4k5
-    pmaddubsw                m5, k4k5
-    paddsw                   m8, m4
-    paddsw                   m9, m5
-    mova                     m4, m6
-    mova                     m5, m7
-    pmaddubsw                m6, k6k7
-    pmaddubsw                m7, k6k7
-    paddsw                  m10, m6
-    paddsw                  m11, m7
-    paddsw                   m8, m10
-    paddsw                   m9, m11
-    mova                     m6, m14
-    paddsw                   m8, krd
-    paddsw                   m9, krd
-    psraw                    m8, 7
-    psraw                    m9, 7
-%ifidn %2, 4
-    packuswb                 m8, m8
-    packuswb                 m9, m9
-%else
-    packuswb                 m8, m9
-%endif
-
-%ifidn %1, v8_avg
-    movx                     m7, [dstq]
-%ifidn %2, 4
-    movx                    m10, [dstq + dstrideq]
-    pavgb                    m9, m10
-%else
-    movhpd                   m7, [dstq + dstrideq]
-%endif
-    pavgb                    m8, m7
-%endif
-    movx                 [dstq], m8
-%ifidn %2, 4
-    movx      [dstq + dstrideq], m9
-%else
-    movhpd    [dstq + dstrideq], m8
-%endif
-
-    lea                    dstq, [dstq + dstrideq * 2 ]
-    sub                 heightd, 2
-    jg                    .loop
-
-    ; Do last row if output_height is odd
-    jne                   .done
-
-    movx                     m7, [srcq + sstrideq]          ;H
-    punpcklbw                m6, m7                         ;G H
-    pmaddubsw                m0, k0k1
-    pmaddubsw                m2, k2k3
-    pmaddubsw                m4, k4k5
-    pmaddubsw                m6, k6k7
-    paddsw                   m0, m4
-    paddsw                   m2, m6
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    psraw                    m0, 7
-    packuswb                 m0, m0
-%ifidn %1, v8_avg
-    movx                     m1, [dstq]
-    pavgb                    m0, m1
-%endif
-    movx                 [dstq], m0
-
-%endif ; ARCH_X86_64
-
-.done:
-    REP_RET
-
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
-                             src, sstride, dst, dstride, height, filter
-    mova                     m4, [filterq]
-    SETUP_LOCAL_VARS
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
-    %define               src1q  r7
-    %define           sstride6q  r8
-    %define          dst_stride  dstrideq
-%else
-    %define               src1q  filterq
-    %define           sstride6q  dstrideq
-    %define          dst_stride  dstridemp
-%endif
-    lea                   src1q, [srcq + sstrideq]
-    lea               sstride6q, [sstrideq + sstrideq * 4]
-    add               sstride6q, sstrideq                   ;pitch * 6
-
-.loop:
-    movh                     m0, [srcq                ]     ;A
-    movh                     m1, [src1q               ]     ;B
-    movh                     m2, [srcq + sstrideq * 2 ]     ;C
-    movh                     m3, [src1q + sstrideq * 2]     ;D
-    movh                     m4, [srcq + sstrideq * 4 ]     ;E
-    movh                     m5, [src1q + sstrideq * 4]     ;F
-
-    punpcklbw                m0, m1                         ;A B
-    movh                     m6, [srcq + sstride6q]         ;G
-    punpcklbw                m2, m3                         ;C D
-    movh                     m7, [src1q + sstride6q]        ;H
-    punpcklbw                m4, m5                         ;E F
-    pmaddubsw                m0, k0k1
-    movh                     m3, [srcq + 8]                 ;A
-    pmaddubsw                m2, k2k3
-    punpcklbw                m6, m7                         ;G H
-    movh                     m5, [srcq + sstrideq + 8]      ;B
-    pmaddubsw                m4, k4k5
-    punpcklbw                m3, m5                         ;A B
-    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
-    pmaddubsw                m6, k6k7
-    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
-    punpcklbw                m7, m5                         ;C D
-    paddsw                   m2, m6
-    pmaddubsw                m3, k0k1
-    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
-    paddsw                   m0, m4
-    pmaddubsw                m7, k2k3
-    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
-    punpcklbw                m1, m6                         ;E F
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    movh                     m2, [srcq + sstride6q + 8]     ;G
-    pmaddubsw                m1, k4k5
-    movh                     m5, [src1q + sstride6q + 8]    ;H
-    psraw                    m0, 7
-    punpcklbw                m2, m5                         ;G H
-    pmaddubsw                m2, k6k7
-    paddsw                   m7, m2
-    paddsw                   m3, m1
-    paddsw                   m3, m7
-    paddsw                   m3, krd
-    psraw                    m3, 7
-%ifidn %1, v8_add_src
-    pxor                     m6, m6
-    movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
-    mova                     m5, m4
-    punpcklbw                m4, m6
-    punpckhbw                m5, m6
-    paddsw                   m0, m4
-    paddsw                   m3, m5
-%endif
-    packuswb                 m0, m3
-
-    add                    srcq, sstrideq
-    add                   src1q, sstrideq
-%ifidn %1, v8_avg
-    pavgb                    m0, [dstq]
-%endif
-    mova                 [dstq], m0
-    add                    dstq, dst_stride
-    dec                 heightd
-    jnz                   .loop
-    REP_RET
-
-%else
-    ; ARCH_X86_64
-    dec                 heightd
-
-    movu                     m1, [srcq                ]     ;A
-    movu                     m3, [srcq + sstrideq     ]     ;B
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw                m0, m1, m3                     ;A B
-    punpckhbw                m1, m3                         ;A B
-    movu                     m5, [srcq]                     ;C
-    punpcklbw                m2, m3, m5                     ;A B next iter
-    punpckhbw                m3, m5                         ;A B next iter
-    mova                   tmp0, m2                         ;store to stack
-    mova                   tmp1, m3                         ;store to stack
-    movu                     m7, [srcq + sstrideq]          ;D
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw                m4, m5, m7                     ;C D
-    punpckhbw                m5, m7                         ;C D
-    movu                     m9, [srcq]                     ;E
-    punpcklbw                m6, m7, m9                     ;C D next iter
-    punpckhbw                m7, m9                         ;C D next iter
-    movu                    m11, [srcq + sstrideq]          ;F
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw                m8, m9, m11                    ;E F
-    punpckhbw                m9, m11                        ;E F
-    movu                     m2, [srcq]                     ;G
-    punpcklbw               m10, m11, m2                    ;E F next iter
-    punpckhbw               m11, m2                         ;E F next iter
-
-.loop:
-    ;Do two rows at once
-    pmaddubsw               m13, m0, k0k1
-    mova                     m0, m4
-    pmaddubsw               m14, m8, k4k5
-    pmaddubsw               m15, m4, k2k3
-    mova                     m4, m8
-    paddsw                  m13, m14
-    movu                     m3, [srcq + sstrideq]          ;H
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw               m14, m2, m3                     ;G H
-    mova                     m8, m14
-    pmaddubsw               m14, k6k7
-    paddsw                  m15, m14
-    paddsw                  m13, m15
-    paddsw                  m13, krd
-    psraw                   m13, 7
-
-    pmaddubsw               m14, m1, k0k1
-    pmaddubsw                m1, m9, k4k5
-    pmaddubsw               m15, m5, k2k3
-    paddsw                  m14, m1
-    mova                     m1, m5
-    mova                     m5, m9
-    punpckhbw                m2, m3                         ;G H
-    mova                     m9, m2
-    pmaddubsw                m2, k6k7
-    paddsw                  m15, m2
-    paddsw                  m14, m15
-    paddsw                  m14, krd
-    psraw                   m14, 7
-    packuswb                m13, m14
-%ifidn %1, v8_avg
-    pavgb                   m13, [dstq]
-%endif
-    mova                 [dstq], m13
-
-    ; next iter
-    pmaddubsw               m15, tmp0, k0k1
-    pmaddubsw               m14, m10, k4k5
-    pmaddubsw               m13, m6, k2k3
-    paddsw                  m15, m14
-    mova                   tmp0, m6
-    mova                     m6, m10
-    movu                     m2, [srcq]                     ;G next iter
-    punpcklbw               m14, m3, m2                     ;G H next iter
-    mova                    m10, m14
-    pmaddubsw               m14, k6k7
-    paddsw                  m13, m14
-    paddsw                  m15, m13
-    paddsw                  m15, krd
-    psraw                   m15, 7
-
-    pmaddubsw               m14, tmp1, k0k1
-    mova                   tmp1, m7
-    pmaddubsw               m13, m7, k2k3
-    mova                     m7, m11
-    pmaddubsw               m11, k4k5
-    paddsw                  m14, m11
-    punpckhbw                m3, m2                         ;G H next iter
-    mova                    m11, m3
-    pmaddubsw                m3, k6k7
-    paddsw                  m13, m3
-    paddsw                  m14, m13
-    paddsw                  m14, krd
-    psraw                   m14, 7
-    packuswb                m15, m14
-%ifidn %1, v8_avg
-    pavgb                   m15, [dstq + dstrideq]
-%endif
-    mova      [dstq + dstrideq], m15
-    lea                    dstq, [dstq + dstrideq * 2]
-    sub                 heightd, 2
-    jg                    .loop
-
-    ; Do last row if output_height is odd
-    jne                   .done
-
-    movu                     m3, [srcq + sstrideq]          ;H
-    punpcklbw                m6, m2, m3                     ;G H
-    punpckhbw                m2, m3                         ;G H
-    pmaddubsw                m0, k0k1
-    pmaddubsw                m1, k0k1
-    pmaddubsw                m4, k2k3
-    pmaddubsw                m5, k2k3
-    pmaddubsw                m8, k4k5
-    pmaddubsw                m9, k4k5
-    pmaddubsw                m6, k6k7
-    pmaddubsw                m2, k6k7
-    paddsw                   m0, m8
-    paddsw                   m1, m9
-    paddsw                   m4, m6
-    paddsw                   m5, m2
-    paddsw                   m0, m4
-    paddsw                   m1, m5
-    paddsw                   m0, krd
-    paddsw                   m1, krd
-    psraw                    m0, 7
-    psraw                    m1, 7
-    packuswb                 m0, m1
-%ifidn %1, v8_avg
-    pavgb                    m0, [dstq]
-%endif
-    mova                 [dstq], m0
-
-.done:
-    REP_RET
-
-%endif ; ARCH_X86_64
-
-%endm
-
-INIT_XMM ssse3
-SUBPIX_VFILTER16     v8
-SUBPIX_VFILTER       v8, 8
-SUBPIX_VFILTER       v8, 4
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
deleted file mode 100644
index d0b4b2839..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklqdq  xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    pxor        xmm2, xmm2
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
-    punpckldq   xmm0, xmm1                  ;two row in one register
-    punpcklbw   xmm0, xmm2                  ;unpack to word
-    pmullw      xmm0, xmm4                  ;multiply the filter factors
-
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-
-    paddsw      xmm0, xmm3                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-
-    pshuflw     xmm6, xmm7, 11111111b       ;k3
-    pshufhw     xmm7, xmm7, 0b              ;k4
-    punpcklwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    pxor        xmm5, xmm5
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm4                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-    punpckhbw   xmm2, xmm5
-    punpckhbw   xmm3, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    pmullw      xmm2, xmm6
-    pmullw      xmm3, xmm7
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm2, xmm3
-
-    paddsw      xmm0, xmm4                  ;rounding
-    paddsw      xmm2, xmm4
-    psraw       xmm0, 7                     ;shift
-    psraw       xmm2, 7
-    packuswb    xmm0, xmm2                  ;pack back to byte
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-SECTION .text
-
-global sym(aom_filter_block1d4_v2_sse2) PRIVATE
-sym(aom_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_sse2) PRIVATE
-sym(aom_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_sse2) PRIVATE
-sym(aom_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-    movdqa        xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d4_h2_sse2) PRIVATE
-sym(aom_filter_block1d4_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_sse2) PRIVATE
-sym(aom_filter_block1d8_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_sse2) PRIVATE
-sym(aom_filter_block1d16_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
deleted file mode 100644
index 59edc49a9..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ /dev/null
@@ -1,267 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         ecx, 0x01000100
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    psrldq      xmm3, 6
-    packsswb    xmm3, xmm3
-    pshuflw     xmm3, xmm3, 0b              ;k3_k4
-
-    movd        xmm2, ecx                   ;rounding_shift
-    pshufd      xmm2, xmm2, 0
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-    punpcklbw   xmm0, xmm1
-    pmaddubsw   xmm0, xmm3
-
-    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         ecx, 0x01000100
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    psrldq      xmm7, 6
-    packsswb    xmm7, xmm7
-    pshuflw     xmm7, xmm7, 0b              ;k3_k4
-    punpcklwd   xmm7, xmm7
-
-    movd        xmm6, ecx                   ;rounding_shift
-    pshufd      xmm6, xmm6, 0
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm1
-    pmaddubsw   xmm0, xmm7
-
-    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
-    packuswb    xmm0, xmm0                  ;pack back to byte
-
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm1
-    punpckhbw   xmm2, xmm1
-    pmaddubsw   xmm0, xmm7
-    pmaddubsw   xmm2, xmm7
-
-    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
-    pmulhrsw    xmm2, xmm6
-    packuswb    xmm0, xmm2                  ;pack back to byte
-
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-SECTION .text
-
-global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
-sym(aom_filter_block1d4_v2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
-sym(aom_filter_block1d8_v2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
-sym(aom_filter_block1d16_v2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
-sym(aom_filter_block1d4_h2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
-sym(aom_filter_block1d8_h2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
-sym(aom_filter_block1d16_h2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
deleted file mode 100644
index 4f5e3f8c1..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom/aom_integer.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-// To start out, just dispatch to the function using the 2D mask and
-// pass mask stride as 0. This can be improved upon if necessary.
-
-void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, 0, w, h, 0, 0);
-}
-
-void aom_highbd_blend_a64_hmask_sse4_1(
-    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
-    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int w, int h, int bd) {
-  aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
-                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
-                                   bd);
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
deleted file mode 100644
index 67fb4d32b..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
+++ /dev/null
@@ -1,900 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>  // SSE4.1
-#include <immintrin.h>  // AVX2
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_dsp/x86/blend_sse4.h"
-#include "aom_dsp/x86/blend_mask_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend_a64_d16_mask_w16_avx2(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
-    int shift) {
-  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
-  const __m256i s0_0 = yy_loadu_256(src0);
-  const __m256i s1_0 = yy_loadu_256(src1);
-  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
-                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
-  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
-                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
-  res0_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
-  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
-  __m256i res = _mm256_packus_epi16(res0, res0);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
-}
-
-static INLINE void blend_a64_d16_mask_w32_avx2(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
-    const __m256i *v_maxval, int shift) {
-  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
-  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
-  const __m256i s0_0 = yy_loadu_256(src0);
-  const __m256i s0_1 = yy_loadu_256(src0 + 16);
-  const __m256i s1_0 = yy_loadu_256(src1);
-  const __m256i s1_1 = yy_loadu_256(src1 + 16);
-  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
-                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
-  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
-                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
-  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
-                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
-  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
-                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
-  res0_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
-  res1_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
-  res1_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
-  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
-  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
-  __m256i res = _mm256_packus_epi16(res0, res1);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  _mm256_storeu_si256((__m256i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m = xx_loadu_128(mask);
-    const __m256i m0 = _mm256_cvtepu8_epi16(m);
-
-    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m = yy_loadu_256(mask + j);
-      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
-      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i two_w = _mm256_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m256i m_i00 = yy_loadu_256(mask);
-    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
-
-    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
-    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
-    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-
-    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i two_w = _mm256_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
-      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
-      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
-
-      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
-      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
-      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
-      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
-      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
-      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-
-      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
-      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
-      const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
-      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-      const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + j);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
-      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
-      const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
-
-      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + j);
-      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
-
-      const __m256i m_ac =
-          _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
-      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
-      const __m256i m1 =
-          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-void aom_lowbd_blend_a64_d16_mask_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  const int round_offset =
-      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
-       (1 << (round_bits - 1)))
-      << AOM_BLEND_A64_ROUND_BITS;
-
-  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
-
-  if (subw == 0 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 1) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  }
-}
-
-static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
-                                       const __m256i *v_m0_b,
-                                       const __m256i *v_m1_b,
-                                       const int32_t bits) {
-  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
-  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
-  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
-  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
-
-  const __m256i v_p0_w =
-      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
-                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
-  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
-  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
-  return v_res;
-}
-
-static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
-                                       const __m256i *v_m0_b,
-                                       const __m256i *v_m1_b,
-                                       const int32_t bits) {
-  const __m256i v_s0_b = yy_loadu_256(src0);
-  const __m256i v_s1_b = yy_loadu_256(src1);
-
-  const __m256i v_p0_w =
-      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
-                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-  const __m256i v_p1_w =
-      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
-                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
-  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
-  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
-  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
-  return v_res;
-}
-
-static INLINE void blend_a64_mask_sx_sy_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    const __m256i v_ral_b = yy_loadu_256(mask);
-    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
-    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
-    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
-    const __m256i v_rvsbl_w =
-        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
-    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-
-    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
-    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
-    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
-                                             AOM_BLEND_A64_ROUND_BITS);
-
-    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
-      const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
-      const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
-      const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
-      const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
-      const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
-      const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
-      const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
-      const __m256i v_rvsbl_w =
-          _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
-      const __m256i v_rvsbh_w =
-          _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
-      const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-      const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
-
-      const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
-      const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
-      const __m256i v_m0_b =
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_ra_b = xx_loadl_64(mask);
-        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_ra_b = xx_loadu_128(mask);
-        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h);
-      break;
-  }
-}
-
-static INLINE void blend_a64_mask_sx_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
-  do {
-    const __m256i v_rl_b = yy_loadu_256(mask);
-    const __m256i v_al_b =
-        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
-
-    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
-    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
-    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
-                                             AOM_BLEND_A64_ROUND_BITS);
-
-    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
-      const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
-      const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
-      const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
-      const __m256i v_al_b =
-          _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
-      const __m256i v_ah_b =
-          _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
-
-      const __m256i v_m0_b =
-          _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_r_b = xx_loadl_64(mask);
-        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_r_b = xx_loadu_128(mask);
-        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h);
-      break;
-  }
-}
-
-static INLINE void blend_a64_mask_sy_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    const __m128i v_ra_b = xx_loadu_128(mask);
-    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storeu_128(dst, v_res_b);
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_ra_b = yy_loadu_256(mask + c);
-      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
-      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_ra_b = xx_loadl_32(mask);
-        const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_ra_b = xx_loadl_64(mask);
-        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h);
-  }
-}
-
-static INLINE void blend_a64_mask_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_m0_b = yy_loadu_256(mask + c);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_m0_b = xx_loadl_32(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_m0_b = xx_loadl_64(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      do {
-        const __m128i v_m0_b = xx_loadu_128(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storeu_128(dst, v_res_b);
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    default:
-      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h);
-  }
-}
-
-void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
-                             const uint8_t *src0, uint32_t src0_stride,
-                             const uint8_t *src1, uint32_t src1_stride,
-                             const uint8_t *mask, uint32_t mask_stride, int w,
-                             int h, int subx, int suby) {
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subx, suby);
-  } else {
-    if (subx & suby) {
-      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, mask_stride, w, h);
-    } else if (subx) {
-      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
-                             src1_stride, mask, mask_stride, w, h);
-    } else if (suby) {
-      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
-                             src1_stride, mask, mask_stride, w, h);
-    } else {
-      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                          mask, mask_stride, w, h);
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
deleted file mode 100644
index 9d6b4c2f7..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ /dev/null
@@ -1,1109 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>  // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/blend_sse4.h"
-#include "aom_dsp/x86/blend_mask_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                     const uint8_t *src0, uint32_t src0_stride,
-                                     const uint8_t *src1, uint32_t src1_stride,
-                                     const uint8_t *mask, uint32_t mask_stride,
-                                     int w, int h) {
-  (void)w;
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                     const uint8_t *src0, uint32_t src0_stride,
-                                     const uint8_t *src1, uint32_t src1_stride,
-                                     const uint8_t *mask, uint32_t mask_stride,
-                                     int w, int h) {
-  (void)w;
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_m0_b = xx_loadl_64(mask);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_m0_b = xx_loadu_128(mask + c);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_r_b = xx_loadu_128(mask);
-    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
-      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
-      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
-      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
-      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sy_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    const __m128i v_ra_b = xx_loadl_32(mask);
-    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sy_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sy_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_ra_b = xx_loadu_128(mask + c);
-      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
-      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_sy_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  (void)w;
-
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  (void)w;
-
-  do {
-    const __m128i v_ra_b = xx_loadu_128(mask);
-    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
-      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
-      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
-      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
-      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
-      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
-      const __m128i v_rvsbl_w =
-          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
-      const __m128i v_rvsbh_w =
-          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
-      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
-      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
-
-      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
-      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
-      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                               const uint8_t *src0, uint32_t src0_stride,
-                               const uint8_t *src1, uint32_t src1_stride,
-                               const uint8_t *mask, uint32_t mask_stride, int w,
-                               int h, int subx, int suby) {
-  typedef void (*blend_fn)(
-      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
-      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
-  // Dimensions are: width_index X subx X suby
-  static const blend_fn blend[3][2][2] = {
-    { // w % 16 == 0
-      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
-      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
-    { // w == 4
-      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
-      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
-    { // w == 8
-      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
-      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
-  };
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subx, suby);
-  } else {
-    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
-                                              src0_stride, src1, src1_stride,
-                                              mask, mask_stride, w, h);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h, blend_4_b10);
-}
-
-static void blend_a64_mask_b12_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h, blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_m0_b = xx_loadl_64(mask + c);
-      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h,
-                               blend_8_b10);
-}
-
-static void blend_a64_mask_b12_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h,
-                               blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
-    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sx_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
-      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sx_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_ra_b = xx_loadl_32(mask);
-    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_ra_b = xx_loadl_64(mask + c);
-      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
-      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-    const __m128i v_rvsb_w =
-        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h,
-                                    blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h,
-                                    blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
-      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-      const __m128i v_rvsb_w =
-          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
-      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h,
-                                     blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h,
-                                     blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
-                                      const uint8_t *src0_8,
-                                      uint32_t src0_stride,
-                                      const uint8_t *src1_8,
-                                      uint32_t src1_stride, const uint8_t *mask,
-                                      uint32_t mask_stride, int w, int h,
-                                      int subx, int suby, int bd) {
-  typedef void (*blend_fn)(
-      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
-      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
-  // Dimensions are: bd_index X width_index X subx X suby
-  static const blend_fn blend[2][2][2][2] = {
-    {   // bd == 8 or 10
-      { // w % 8 == 0
-        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
-        { blend_a64_mask_b10_sx_w8n_sse4_1,
-          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
-      { // w == 4
-        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
-        { blend_a64_mask_b10_sx_w4_sse4_1,
-          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
-    {   // bd == 12
-      { // w % 8 == 0
-        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
-        { blend_a64_mask_b12_sx_w8n_sse4_1,
-          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
-      { // w == 4
-        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
-        { blend_a64_mask_b12_sx_w4_sse4_1,
-          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
-  };
-
-  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
-  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                src1_stride, mask, mask_stride, w, h, subx,
-                                suby, bd);
-  } else {
-    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
-    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
-    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
-    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
-        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-        mask_stride, w, h);
-  }
-}
-
-static INLINE void blend_a64_d16_mask_w16_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
-    const __m128i *v_maxval, int shift) {
-  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
-  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
-  const __m128i s0_0 = xx_loadu_128(src0);
-  const __m128i s0_1 = xx_loadu_128(src0 + 8);
-  const __m128i s1_0 = xx_loadu_128(src1);
-  const __m128i s1_1 = xx_loadu_128(src1 + 8);
-  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
-                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
-  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
-                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
-  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
-                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
-  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
-                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
-  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
-  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
-  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
-  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
-  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
-  const __m128i res = _mm_packus_epi16(res0, res1);
-
-  _mm_storeu_si128((__m128i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m = xx_loadu_128(mask + j);
-      const __m128i m0 = _mm_cvtepu8_epi16(m);
-      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
-      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
-      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
-
-      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
-      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
-      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
-      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
-      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
-      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
-      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
-      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
-      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
-      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
-      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + j);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
-      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
-      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
-      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-void aom_lowbd_blend_a64_d16_mask_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  const int round_offset =
-      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
-       (1 << (round_bits - 1)))
-      << AOM_BLEND_A64_ROUND_BITS;
-
-  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-
-  if (subw == 0 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-
-  } else if (subw == 1 && subh == 1) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
deleted file mode 100644
index 064910232..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>  // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/blend_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                      const uint8_t *src0, uint32_t src0_stride,
-                                      const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  (void)w;
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                      const uint8_t *src0, uint32_t src0_stride,
-                                      const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  (void)w;
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                        const uint8_t *src0,
-                                        uint32_t src0_stride,
-                                        const uint8_t *src1,
-                                        uint32_t src1_stride,
-                                        const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h);
-
-  // Dimension: width_index
-  static const blend_fn blend[9] = {
-    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
-    aom_blend_a64_vmask_c,        // w == 1
-    aom_blend_a64_vmask_c,        // w == 2
-    NULL,                         // INVALID
-    blend_a64_vmask_w4_sse4_1,    // w == 4
-    NULL,                         // INVALID
-    NULL,                         // INVALID
-    NULL,                         // INVALID
-    blend_a64_vmask_w8_sse4_1,    // w == 8
-  };
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
-                 h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_vmask_bn_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                          const uint16_t *src0,
-                                          uint32_t src0_stride,
-                                          const uint16_t *src1,
-                                          uint32_t src1_stride,
-                                          const uint8_t *mask, int w, int h) {
-  (void)w;
-  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, h, blend_4_b10);
-}
-
-static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                          const uint16_t *src0,
-                                          uint32_t src0_stride,
-                                          const uint16_t *src1,
-                                          uint32_t src1_stride,
-                                          const uint8_t *mask, int w, int h) {
-  (void)w;
-  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, h, blend_4_b12);
-}
-
-static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                           const uint16_t *src0,
-                                           uint32_t src0_stride,
-                                           const uint16_t *src1,
-                                           uint32_t src1_stride,
-                                           const uint8_t *mask, int w, int h) {
-  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, w, h, blend_8_b10);
-}
-
-static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                           const uint16_t *src0,
-                                           uint32_t src0_stride,
-                                           const uint16_t *src1,
-                                           uint32_t src1_stride,
-                                           const uint8_t *mask, int w, int h) {
-  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, w, h, blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_highbd_blend_a64_vmask_sse4_1(
-    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
-    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int w, int h, int bd) {
-  typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
-                           const uint16_t *src0, uint32_t src0_stride,
-                           const uint16_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h);
-
-  // Dimensions are: bd_index X width_index
-  static const blend_fn blend[2][2] = {
-    {
-        // bd == 8 or 10
-        blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
-        blend_a64_vmask_b10_w4_sse4_1,   // w == 4
-    },
-    {
-        // bd == 12
-        blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
-        blend_a64_vmask_b12_w4_sse4_1,   // w == 4
-    }
-  };
-
-  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
-  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                 src1_stride, mask, w, h, bd);
-  } else {
-    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
-    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
-    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
-    blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, w, h);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
deleted file mode 100644
index c071fdcfc..000000000
--- a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
-#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
-#include <smmintrin.h>  // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend_a64_d16_mask_w4_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
-    int shift) {
-  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
-  const __m128i s0 = xx_loadl_64(src0);
-  const __m128i s1 = xx_loadl_64(src1);
-  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
-  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
-  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
-  const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
-  const __m128i res_d = _mm_srai_epi32(res_c, shift);
-  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
-  const __m128i res = _mm_packus_epi16(res_e, res_e);
-
-  xx_storel_32(dst, res);
-}
-
-static INLINE void blend_a64_d16_mask_w8_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
-    int shift) {
-  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
-  const __m128i s0 = xx_loadu_128(src0);
-  const __m128i s1 = xx_loadu_128(src1);
-  __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
-                                  _mm_unpacklo_epi16(*m, max_minus_m));
-  __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
-                                  _mm_unpackhi_epi16(*m, max_minus_m));
-  res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
-  res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
-  const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
-  const __m128i res = _mm_packus_epi16(res_e, res_e);
-
-  _mm_storel_epi64((__m128i *)(dst), res);
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m0 = xx_loadl_32(mask);
-    const __m128i m = _mm_cvtepu8_epi16(m0);
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m0 = xx_loadl_64(mask);
-    const __m128i m = _mm_cvtepu8_epi16(m0);
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
-    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
-    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadu_128(mask);
-    const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
-    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
-    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
-    const __m128i m = _mm_avg_epu16(m_ac, zeros);
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadu_128(mask);
-    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
-    const __m128i m = _mm_avg_epu16(m_ac, zeros);
-
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
-
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-#endif  // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
deleted file mode 100644
index 8d9b32510..000000000
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
-#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
-
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/synonyms.h"
-static const uint8_t g_blend_a64_mask_shuffle[32] = {
-  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-};
-
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_32(src0);
-  const __m128i v_s1_b = xx_loadl_32(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_64(src0);
-  const __m128i v_s1_b = xx_loadl_64(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
-                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
-                                 const __m128i *rounding) {
-  const __m128i v_s0_b = xx_loadl_32(src0);
-  const __m128i v_s1_b = xx_loadl_32(src1);
-
-  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
-  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
-  return v_res;
-}
-
-static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
-                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
-                                 const __m128i *rounding) {
-  const __m128i v_s0_b = xx_loadl_64(src0);
-  const __m128i v_s1_b = xx_loadl_64(src1);
-
-  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
-  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
-  return v_res;
-}
-
-static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
-                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
-                                  const __m128i *rounding) {
-  const __m128i v_s0_b = xx_loadu_128(src0);
-  const __m128i v_s1_b = xx_loadu_128(src1);
-
-  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
-  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
-  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
-  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
-  return v_res;
-}
-
-typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
-                                 const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  // Interleave
-  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
-  // Scale
-  const __m128i v_ssum_d =
-      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  // Interleave
-  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
-  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
-  // Scale
-  const __m128i v_ssuml_d =
-      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
-  const __m128i v_ssumh_d =
-      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-#endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
deleted file mode 100644
index 96fe4ebb6..000000000
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
-#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-// Note: in and out could have the same value
-static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
-  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
-  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
-  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
-  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
-  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
-  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
-  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
-  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
-  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
-  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
-  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
-  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
-  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
-  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
-  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
-  // 00 10 01 11 02 12 03 13  08 18 09 19 0a 1a 0b 1b
-  // 04 14 05 15 06 16 07 17  0c 1c 0d 1d 0e 1e 0f 1f
-  // 20 30 21 31 22 32 23 33  28 38 29 39 2a 3a 2b 3b
-  // 24 34 25 35 26 36 27 37  2c 3c 2d 3d 2e 3e 2f 3f
-  // 40 50 41 51 42 52 43 53  48 58 49 59 4a 5a 4b 5b
-  // 44 54 45 55 46 56 47 57  4c 5c 4d 5d 4e 5e 4f 5f
-  // 60 70 61 71 62 72 63 73  68 78 69 79 6a 7a 6b 7b
-  // 64 74 65 75 66 76 67 77  6c 7c 6d 7d 6e 7e 6f 7f
-
-  // 80 90 81 91 82 92 83 93  88 98 89 99 8a 9a 8b 9b
-  // 84 94 85 95 86 96 87 97  8c 9c 8d 9d 8e 9e 8f 9f
-  // a0 b0 a1 b1 a2 b2 a3 b3  a8 b8 a9 b9 aa ba ab bb
-  // a4 b4 a5 b5 a6 b6 a7 b7  ac bc ad bd ae be af bf
-  // c0 d0 c1 d1 c2 d2 c3 d3  c8 d8 c9 d9 ca da cb db
-  // c4 d4 c5 d5 c6 d6 c7 d7  cc dc cd dd ce de cf df
-  // e0 f0 e1 f1 e2 f2 e3 f3  e8 f8 e9 f9 ea fa eb fb
-  // e4 f4 e5 f5 e6 f6 e7 f7  ec fc ed fd ee fe ef ff
-
-  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
-  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
-  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
-  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
-  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
-  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
-  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
-  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
-  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
-  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
-  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
-  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
-  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
-  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
-  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
-  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
-  // 00 10 20 30 01 11 21 31  08 18 28 38 09 19 29 39
-  // 02 12 22 32 03 13 23 33  0a 1a 2a 3a 0b 1b 2b 3b
-  // 04 14 24 34 05 15 25 35  0c 1c 2c 3c 0d 1d 2d 3d
-  // 06 16 26 36 07 17 27 37  0e 1e 2e 3e 0f 1f 2f 3f
-  // 40 50 60 70 41 51 61 71  48 58 68 78 49 59 69 79
-  // 42 52 62 72 43 53 63 73  4a 5a 6a 7a 4b 5b 6b 7b
-  // 44 54 64 74 45 55 65 75  4c 5c 6c 7c 4d 5d 6d 7d
-  // 46 56 66 76 47 57 67 77  4e 5e 6e 7e 4f 5f 6f 7f
-
-  // 80 90 a0 b0 81 91 a1 b1  88 98 a8 b8 89 99 a9 b9
-  // 82 92 a2 b2 83 93 a3 b3  8a 9a aa ba 8b 9b ab bb
-  // 84 94 a4 b4 85 95 a5 b5  8c 9c ac bc 8d 9d ad bd
-  // 86 96 a6 b6 87 97 a7 b7  8e ae 9e be 8f 9f af bf
-  // c0 d0 e0 f0 c1 d1 e1 f1  c8 d8 e8 f8 c9 d9 e9 f9
-  // c2 d2 e2 f2 c3 d3 e3 f3  ca da ea fa cb db eb fb
-  // c4 d4 e4 f4 c5 d5 e5 f5  cc dc ef fc cd dd ed fd
-  // c6 d6 e6 f6 c7 d7 e7 f7  ce de ee fe cf df ef ff
-
-  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
-  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
-  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
-  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
-  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
-  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
-  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
-  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
-  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
-  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
-  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
-  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
-  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
-  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
-  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
-  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
-  // 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
-  // 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
-  // 02 12 22 32 42 52 62 72  0a 1a 2a 3a 4a 5a 6a 7a
-  // 03 13 23 33 43 53 63 73  0b 1b 2b 3b 4b 5b 6b 7b
-  // 04 14 24 34 44 54 64 74  0c 1c 2c 3c 4c 5c 6c 7c
-  // 05 15 25 35 45 55 65 75  0d 1d 2d 3d 4d 5d 6d 7d
-  // 06 16 26 36 46 56 66 76  0e 1e 2e 3e 4e 5e 6e 7e
-  // 07 17 27 37 47 57 67 77  0f 1f 2f 3f 4f 5f 6f 7f
-
-  // 80 90 a0 b0 c0 d0 e0 f0  88 98 a8 b8 c8 d8 e8 f8
-  // 81 91 a1 b1 c1 d1 e1 f1  89 99 a9 b9 c9 d9 e9 f9
-  // 82 92 a2 b2 c2 d2 e2 f2  8a 9a aa ba ca da ea fa
-  // 83 93 a3 b3 c3 d3 e3 f3  8b 9b ab bb cb db eb fb
-  // 84 94 a4 b4 c4 d4 e4 f4  8c 9c ac bc cc dc ef fc
-  // 85 95 a5 b5 c5 d5 e5 f5  8d 9d ad bd cd dd ed fd
-  // 86 96 a6 b6 c6 d6 e6 f6  8e ae 9e be ce de ee fe
-  // 87 97 a7 b7 c7 d7 e7 f7  8f 9f af bf cf df ef ff
-
-  out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
-  out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
-  out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
-  out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
-  out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
-  out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
-  out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
-  out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
-  out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
-  out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
-  out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
-  out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
-  out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
-  out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
-  out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
-  out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-#endif  // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
deleted file mode 100644
index 3e19682cd..000000000
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                uint32_t output_height, const int16_t *filter);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
-  void aom_convolve8_##name##_##opt(                                         \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    (void)filter_x;                                                          \
-    (void)x_step_q4;                                                         \
-    (void)filter_y;                                                          \
-    (void)y_step_q4;                                                         \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
-    assert(step_q4 == 16);                                                   \
-    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
-        (filter[2] | filter[5])) {                                           \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    } else if (filter[0] | filter[1] | filter[2]) {                          \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    } else {                                                                 \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    }                                                                        \
-    if (w) {                                                                 \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
-                               x_step_q4, filter_y, y_step_q4, w, h);        \
-    }                                                                        \
-  }
-
-typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
-                                       const ptrdiff_t src_pitch,
-                                       uint16_t *output_ptr,
-                                       ptrdiff_t out_pitch,
-                                       unsigned int output_height,
-                                       const int16_t *filter, int bd);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)  \
-  void aom_highbd_convolve8_##name##_##opt(                                \
-      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,            \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,        \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {      \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                             \
-    if (step_q4 == 16 && filter[3] != 128) {                               \
-      if (filter[0] | filter[1] | filter[2]) {                             \
-        while (w >= 16) {                                                  \
-          aom_highbd_filter_block1d16_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          aom_highbd_filter_block1d8_##dir##8_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          aom_highbd_filter_block1d4_##dir##8_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      } else {                                                             \
-        while (w >= 16) {                                                  \
-          aom_highbd_filter_block1d16_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);            \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          aom_highbd_filter_block1d8_##dir##2_##avg##opt(                  \
-              src, src_stride, dst, dst_stride, h, filter, bd);            \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          aom_highbd_filter_block1d4_##dir##2_##avg##opt(                  \
-              src, src_stride, dst, dst_stride, h, filter, bd);            \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      }                                                                    \
-    }                                                                      \
-    if (w) {                                                               \
-      aom_highbd_convolve8_##name##_c(                                     \
-          CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst),    \
-          dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
-    }                                                                      \
-  }
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
deleted file mode 100644
index 30253f65c..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
-
-// filters for 16
-DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
-  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
-  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
-  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
-  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
-  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
-  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
-  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
-  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
-  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
-  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
-  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-};
-
-static INLINE void prepare_coeffs_lowbd(
-    const InterpFilterParams *const filter_params, const int subpel_q4,
-    __m256i *const coeffs /* [4] */) {
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
-  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
-
-  // right shift all filter co-efficients by 1 to reduce the bits required.
-  // This extra right shift will be taken care of at the end while rounding
-  // the result.
-  // Since all filter co-efficients are even, this change will not affect the
-  // end result
-  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
-                            _mm_set1_epi16(0xffff)));
-
-  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
-}
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
-                                  const int subpel_q4,
-                                  __m256i *const coeffs /* [4] */) {
-  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-
-  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
-  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
-}
-
-static INLINE __m256i convolve_lowbd(const __m256i *const s,
-                                     const __m256i *const coeffs) {
-  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
-  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
-  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
-  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
-
-  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
-                                       _mm256_add_epi16(res_23, res_67));
-
-  return res;
-}
-
-static INLINE __m256i convolve(const __m256i *const s,
-                               const __m256i *const coeffs) {
-  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
-  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
-  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
-  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
-
-  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
-                                       _mm256_add_epi32(res_2, res_3));
-
-  return res;
-}
-
-static INLINE __m256i convolve_lowbd_x(const __m256i data,
-                                       const __m256i *const coeffs,
-                                       const __m256i *const filt) {
-  __m256i s[4];
-
-  s[0] = _mm256_shuffle_epi8(data, filt[0]);
-  s[1] = _mm256_shuffle_epi8(data, filt[1]);
-  s[2] = _mm256_shuffle_epi8(data, filt[2]);
-  s[3] = _mm256_shuffle_epi8(data, filt[3]);
-
-  return convolve_lowbd(s, coeffs);
-}
-
-static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
-                                         const __m256i *const res,
-                                         const int do_average) {
-  __m256i d;
-  if (do_average) {
-    d = _mm256_load_si256((__m256i *)dst);
-    d = _mm256_add_epi32(d, *res);
-    d = _mm256_srai_epi32(d, 1);
-  } else {
-    d = *res;
-  }
-  _mm256_store_si256((__m256i *)dst, d);
-}
-
-static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
-                               const __m256i *const res_unsigned,
-                               const __m256i *const wt,
-                               const int use_jnt_comp_avg) {
-  __m256i res;
-  if (use_jnt_comp_avg) {
-    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
-    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
-
-    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
-    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
-
-    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-
-    res = _mm256_packs_epi32(res_lo, res_hi);
-  } else {
-    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
-    res = _mm256_srai_epi16(wt_res, 1);
-  }
-  return res;
-}
-
-static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
-                                        const __m256i *const offset_const,
-                                        const __m256i *const round_const,
-                                        const int round_shift) {
-  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
-  const __m256i res_round = _mm256_srai_epi16(
-      _mm256_add_epi16(res_signed, *round_const), round_shift);
-  return res_round;
-}
-
-static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
-                                      const __m256i *const res_unsigned,
-                                      const __m256i *const wt0,
-                                      const __m256i *const wt1,
-                                      const int use_jnt_comp_avg) {
-  __m256i res;
-  if (use_jnt_comp_avg) {
-    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
-    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
-    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
-    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
-  } else {
-    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
-    res = _mm256_srai_epi32(wt_res, 1);
-  }
-  return res;
-}
-
-static INLINE __m256i highbd_convolve_rounding(
-    const __m256i *const res_unsigned, const __m256i *const offset_const,
-    const __m256i *const round_const, const int round_shift) {
-  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
-  const __m256i res_round = _mm256_srai_epi32(
-      _mm256_add_epi32(res_signed, *round_const), round_shift);
-
-  return res_round;
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
deleted file mode 100644
index 707bd2d78..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
-                             const int do_average) {
-  __m128i d;
-  if (do_average) {
-    d = _mm_load_si128((__m128i *)dst);
-    d = _mm_add_epi32(d, *res);
-    d = _mm_srai_epi32(d, 1);
-  } else {
-    d = *res;
-  }
-  _mm_store_si128((__m128i *)dst, d);
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
deleted file mode 100644
index 445d04b10..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
-
-// Note:
-//  This header file must be included after any x86 intrinsics header file.
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
-                                  const int subpel_q4,
-                                  __m128i *const coeffs /* [4] */) {
-  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
-}
-
-static INLINE __m128i convolve(const __m128i *const s,
-                               const __m128i *const coeffs) {
-  const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
-  const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
-  const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
-  const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
-
-  const __m128i res =
-      _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
-
-  return res;
-}
-
-static INLINE __m128i convolve_lo_x(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
-  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
-  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_lo_y(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
-  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
-  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_hi_y(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
-  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
-  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
-                               const __m128i *const res_unsigned,
-                               const __m128i *const wt,
-                               const int use_jnt_comp_avg) {
-  __m128i res;
-  if (use_jnt_comp_avg) {
-    const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
-    const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
-
-    const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
-    const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
-
-    const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-    const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-
-    res = _mm_packs_epi32(res_lo, res_hi);
-  } else {
-    const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
-    res = _mm_srai_epi16(wt_res, 1);
-  }
-  return res;
-}
-
-static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
-                                        const __m128i *const offset_const,
-                                        const __m128i *const round_const,
-                                        const int round_shift) {
-  const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
-  const __m128i res_round =
-      _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
-  return res_round;
-}
-
-static INLINE __m128i highbd_convolve_rounding_sse2(
-    const __m128i *const res_unsigned, const __m128i *const offset_const,
-    const __m128i *const round_const, const int round_shift) {
-  const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
-  const __m128i res_round =
-      _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
-
-  return res_round;
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
deleted file mode 100644
index 6b8388d84..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
-
-// Note:
-//  This header file must be included after any x86 intrinsics header file.
-
-static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
-                                  const __m128i *const res,
-                                  const __m128i *const wt0,
-                                  const __m128i *const wt1,
-                                  const int do_average) {
-  __m128i d;
-  if (do_average) {
-    d = _mm_load_si128((__m128i *)dst);
-    d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
-    d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
-  } else {
-    d = *res;
-  }
-  _mm_store_si128((__m128i *)dst, d);
-}
-
-static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
-                                             const __m128i *const res_unsigned,
-                                             const __m128i *const wt0,
-                                             const __m128i *const wt1,
-                                             const int use_jnt_comp_avg) {
-  __m128i res;
-  if (use_jnt_comp_avg) {
-    const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
-    const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
-
-    const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
-    res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
-  } else {
-    const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
-    res = _mm_srai_epi32(wt_res, 1);
-  }
-  return res;
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
deleted file mode 100644
index 54da02253..000000000
--- a/third_party/aom/aom_dsp/x86/fft_avx2.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-extern void aom_transpose_float_sse2(const float *A, float *B, int n);
-extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
-                                          int n);
-
-// Generate the 1d forward transforms for float using _mm256
-GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-          _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-          _mm256_mul_ps);
-GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
-GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
-
-void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-// Generate the 1d inverse transforms for float using _mm256
-GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
-GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-            _mm256_mul_ps);
-GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-            _mm256_mul_ps);
-
-void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
-                  aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
-}
-
-void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
-                  aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
-                  aom_transpose_float_sse2, 8);
-}
-
-void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
-                  aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
-                  aom_transpose_float_sse2, 8);
-}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
deleted file mode 100644
index 12bdc3e18..000000000
--- a/third_party/aom/aom_dsp/x86/fft_sse2.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <xmmintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-static INLINE void transpose4x4(const float *A, float *B, const int lda,
-                                const int ldb) {
-  __m128 row1 = _mm_load_ps(&A[0 * lda]);
-  __m128 row2 = _mm_load_ps(&A[1 * lda]);
-  __m128 row3 = _mm_load_ps(&A[2 * lda]);
-  __m128 row4 = _mm_load_ps(&A[3 * lda]);
-  _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
-  _mm_store_ps(&B[0 * ldb], row1);
-  _mm_store_ps(&B[1 * ldb], row2);
-  _mm_store_ps(&B[2 * ldb], row3);
-  _mm_store_ps(&B[3 * ldb], row4);
-}
-
-void aom_transpose_float_sse2(const float *A, float *B, int n) {
-  for (int y = 0; y < n; y += 4) {
-    for (int x = 0; x < n; x += 4) {
-      transpose4x4(A + y * n + x, B + x * n + y, n, n);
-    }
-  }
-}
-
-void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
-  const int n2 = n / 2;
-  output[0] = packed[0];
-  output[1] = 0;
-  output[2 * (n2 * n)] = packed[n2 * n];
-  output[2 * (n2 * n) + 1] = 0;
-
-  output[2 * n2] = packed[n2];
-  output[2 * n2 + 1] = 0;
-  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
-  output[2 * (n2 * n + n2) + 1] = 0;
-
-  for (int c = 1; c < n2; ++c) {
-    output[2 * (0 * n + c)] = packed[c];
-    output[2 * (0 * n + c) + 1] = packed[c + n2];
-    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
-    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
-  }
-  for (int r = 1; r < n2; ++r) {
-    output[2 * (r * n + 0)] = packed[r * n];
-    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
-    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
-    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
-
-    for (int c = 1; c < AOMMIN(n2, 4); ++c) {
-      output[2 * (r * n + c)] =
-          packed[r * n + c] - packed[(r + n2) * n + c + n2];
-      output[2 * (r * n + c) + 1] =
-          packed[(r + n2) * n + c] + packed[r * n + c + n2];
-    }
-
-    for (int c = 4; c < n2; c += 4) {
-      __m128 real1 = _mm_load_ps(packed + r * n + c);
-      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
-      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
-      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
-      real1 = _mm_sub_ps(real1, real2);
-      imag1 = _mm_add_ps(imag1, imag2);
-      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
-      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
-    }
-
-    int r2 = r + n2;
-    int r3 = n - r2;
-    output[2 * (r2 * n + 0)] = packed[r3 * n];
-    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
-    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
-    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
-    for (int c = 1; c < AOMMIN(4, n2); ++c) {
-      output[2 * (r2 * n + c)] =
-          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
-      output[2 * (r2 * n + c) + 1] =
-          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
-    }
-    for (int c = 4; c < n2; c += 4) {
-      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
-      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
-      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
-      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
-      real1 = _mm_add_ps(real1, real2);
-      imag1 = _mm_sub_ps(imag2, imag1);
-      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
-      _mm_store_ps(output + 2 * (r2 * n + c + 2),
-                   _mm_unpackhi_ps(real1, imag1));
-    }
-  }
-}
-
-// Generate definitions for 1d transforms using float and __m128
-GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-          _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
-GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-
-void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-// Generate definitions for 1d inverse transforms using float and __m128
-GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
-GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-
-void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
-                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
-                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
-                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
-                  aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
-                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
-                  aom_transpose_float_sse2, 4);
-}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
deleted file mode 100644
index 1e3d13ec8..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
-  int pass;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = ADD_EPI16(in0, in7);
-    const __m128i q1 = ADD_EPI16(in1, in6);
-    const __m128i q2 = ADD_EPI16(in2, in5);
-    const __m128i q3 = ADD_EPI16(in3, in4);
-    const __m128i q4 = SUB_EPI16(in3, in4);
-    const __m128i q5 = SUB_EPI16(in2, in5);
-    const __m128i q6 = SUB_EPI16(in1, in6);
-    const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
-    if (pass == 1) {
-      overflow =
-          check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = ADD_EPI16(q0, q3);
-      const __m128i r1 = ADD_EPI16(q1, q2);
-      const __m128i r2 = SUB_EPI16(q1, q2);
-      const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      // Interleave to do the multiply by constants which gets us into 32bits
-      {
-        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-        // dct_const_round_shift
-        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-        // Combine
-        res0 = _mm_packs_epi32(w0, w1);
-        res4 = _mm_packs_epi32(w2, w3);
-        res2 = _mm_packs_epi32(w4, w5);
-        res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
-        if (overflow) {
-          aom_highbd_fdct8x8_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&r0, &r1);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // Add/subtract
-        const __m128i x0 = ADD_EPI16(q4, r0);
-        const __m128i x1 = SUB_EPI16(q4, r0);
-        const __m128i x2 = SUB_EPI16(q7, r1);
-        const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-        if (overflow) {
-          aom_highbd_fdct8x8_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Interleave to do the multiply by constants which gets us into 32bits
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res1 = _mm_packs_epi32(w0, w1);
-          res7 = _mm_packs_epi32(w2, w3);
-          res5 = _mm_packs_epi32(w4, w5);
-          res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
-          if (overflow) {
-            aom_highbd_fdct8x8_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 44 54 45 55 46 56 47 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 25 35
-      // 44 54 64 74 45 55 65 75
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division by two of 16-bit signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-    // store results
-    store_output(&in0, (output + 0 * 8));
-    store_output(&in1, (output + 1 * 8));
-    store_output(&in2, (output + 2 * 8));
-    store_output(&in3, (output + 3 * 8));
-    store_output(&in4, (output + 4 * 8));
-    store_output(&in5, (output + 5 * 8));
-    store_output(&in6, (output + 6 * 8));
-    store_output(&in7, (output + 7 * 8));
-  }
-}
-
-#undef ADD_EPI16
-#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
deleted file mode 100644
index 2d8f8f71e..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i u0, u1, sum;
-
-  u0 = _mm_add_epi16(in0, in1);
-  u1 = _mm_add_epi16(in2, in3);
-
-  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-  sum = _mm_add_epi16(u0, u1);
-
-  in0 = _mm_add_epi16(in0, in1);
-  in2 = _mm_add_epi16(in2, in3);
-  sum = _mm_add_epi16(sum, in0);
-
-  u0 = _mm_setzero_si128();
-  sum = _mm_add_epi16(sum, in2);
-
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
-}
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT8x8_2D aom_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT8x8_2D
-
-#undef DCT_HIGH_BIT_DEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT8x8_2D
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
deleted file mode 100644
index 260d8dd58..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
-#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
-  __m128i buf0, buf1;
-  buf0 = _mm_mul_epu32(a, b);
-  a = _mm_srli_epi64(a, 32);
-  b = _mm_srli_epi64(b, 32);
-  buf1 = _mm_mul_epu32(a, b);
-  return _mm_add_epi64(buf0, buf1);
-}
-
-static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
-  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm_unpacklo_epi64(buf0, buf1);
-}
-
-static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
-                                          const __m128i *preg1) {
-  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
-                              _mm_cmpeq_epi16(*preg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
-                              _mm_cmpeq_epi16(*preg1, min_overflow));
-  cmp0 = _mm_or_si128(cmp0, cmp1);
-  return _mm_movemask_epi8(cmp0);
-}
-
-static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
-                                          const __m128i *preg1,
-                                          const __m128i *preg2,
-                                          const __m128i *preg3) {
-  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
-                              _mm_cmpeq_epi16(*preg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
-                              _mm_cmpeq_epi16(*preg1, min_overflow));
-  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
-                              _mm_cmpeq_epi16(*preg2, min_overflow));
-  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
-                              _mm_cmpeq_epi16(*preg3, min_overflow));
-  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
-  return _mm_movemask_epi8(cmp0);
-}
-
-static INLINE int check_epi16_overflow_x8(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x12(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
-  return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x16(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  if (!res0) {
-    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
-    if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
-  }
-  return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x32(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
-    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
-    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
-    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
-    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
-    const __m128i *preg30, const __m128i *preg31) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  if (!res0) {
-    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
-    if (!res1) {
-      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
-      if (!res0) {
-        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
-        if (!res1) {
-          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
-          if (!res0) {
-            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
-            if (!res1)
-              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
-          }
-        }
-      }
-    }
-  }
-  return res0 + res1;
-}
-
-static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m128i zero = _mm_setzero_si128();
-    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-    _mm_store_si128((__m128i *)(dst_ptr), out0);
-    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-  } else {
-    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-  }
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
deleted file mode 100644
index c1fb259a1..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585,  11585
-TRANSFORM_COEFFS 15137,   6270
-TRANSFORM_COEFFS 16069,   3196
-TRANSFORM_COEFFS  9102,  13623
-
-%macro STORE_OUTPUT 2 ; index, result
-  ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  ; _mm_store_si128((__m128i *)(dst_ptr), out0);
-  ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-  pxor               m11, m11
-  pcmpgtw            m11, m%2
-  movdqa             m12, m%2
-  punpcklwd          m%2, m11
-  punpckhwd          m12, m11
-  mova               [outputq + 4*%1 +  0], m%2
-  mova               [outputq + 4*%1 + 16], m12
-%endmacro
-
-SECTION .text
-
-%if ARCH_X86_64
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
-  mova               m8, [GLOBAL(pd_8192)]
-  mova              m12, [GLOBAL(pw_11585x2)]
-
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  ; left shift by 2 to increase forward transformation precision
-  psllw              m0, 2
-  psllw              m1, 2
-  psllw              m2, 2
-  psllw              m3, 2
-  psllw              m4, 2
-  psllw              m5, 2
-  psllw              m6, 2
-  psllw              m7, 2
-
-  ; column transform
-  ; stage 1
-  paddw m10, m0, m7
-  psubw m0, m7
-
-  paddw m9, m1, m6
-  psubw m1, m6
-
-  paddw m7, m2, m5
-  psubw m2, m5
-
-  paddw m6, m3, m4
-  psubw m3, m4
-
-  ; stage 2
-  paddw m5, m9, m7
-  psubw m9, m7
-
-  paddw m4, m10, m6
-  psubw m10, m6
-
-  paddw m7, m1, m2
-  psubw m1, m2
-
-  ; stage 3
-  paddw m6, m4, m5
-  psubw m4, m5
-
-  pmulhrsw m1, m12
-  pmulhrsw m7, m12
-
-  ; sin(pi / 8), cos(pi / 8)
-  punpcklwd m2, m10, m9
-  punpckhwd m10, m9
-  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
-  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
-  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
-  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
-  paddd m5, m8
-  paddd m2, m8
-  paddd m9, m8
-  paddd m10, m8
-  psrad m5, 14
-  psrad m2, 14
-  psrad m9, 14
-  psrad m10, 14
-  packssdw m5, m9
-  packssdw m2, m10
-
-  pmulhrsw m6, m12
-  pmulhrsw m4, m12
-
-  paddw m9, m3, m1
-  psubw m3, m1
-
-  paddw m10, m0, m7
-  psubw m0, m7
-
-  ; stage 4
-  ; sin(pi / 16), cos(pi / 16)
-  punpcklwd m1, m10, m9
-  punpckhwd m10, m9
-  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
-  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
-  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
-  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
-  paddd m7, m8
-  paddd m1, m8
-  paddd m9, m8
-  paddd m10, m8
-  psrad m7, 14
-  psrad m1, 14
-  psrad m9, 14
-  psrad m10, 14
-  packssdw m7, m9
-  packssdw m1, m10
-
-  ; sin(3 * pi / 16), cos(3 * pi / 16)
-  punpcklwd m11, m0, m3
-  punpckhwd m0, m3
-  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
-  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
-  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
-  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
-  paddd m9, m8
-  paddd m11, m8
-  paddd m3, m8
-  paddd m0, m8
-  psrad m9, 14
-  psrad m11, 14
-  psrad m3, 14
-  psrad m0, 14
-  packssdw m9, m3
-  packssdw m11, m0
-
-  ; transpose
-  ; stage 1
-  punpcklwd m0, m6, m7
-  punpcklwd m3, m5, m11
-  punpckhwd m6, m7
-  punpckhwd m5, m11
-  punpcklwd m7, m4, m9
-  punpcklwd m10, m2, m1
-  punpckhwd m4, m9
-  punpckhwd m2, m1
-
-  ; stage 2
-  punpckldq m9, m0, m3
-  punpckldq m1, m6, m5
-  punpckhdq m0, m3
-  punpckhdq m6, m5
-  punpckldq m3, m7, m10
-  punpckldq m5, m4, m2
-  punpckhdq m7, m10
-  punpckhdq m4, m2
-
-  ; stage 3
-  punpcklqdq m10, m9, m3
-  punpckhqdq m9, m3
-  punpcklqdq m2, m0, m7
-  punpckhqdq m0, m7
-  punpcklqdq m3, m1, m5
-  punpckhqdq m1, m5
-  punpcklqdq m7, m6, m4
-  punpckhqdq m6, m4
-
-  ; row transform
-  ; stage 1
-  paddw m5, m10, m6
-  psubw m10, m6
-
-  paddw m4, m9, m7
-  psubw m9, m7
-
-  paddw m6, m2, m1
-  psubw m2, m1
-
-  paddw m7, m0, m3
-  psubw m0, m3
-
-  ;stage 2
-  paddw m1, m5, m7
-  psubw m5, m7
-
-  paddw m3, m4, m6
-  psubw m4, m6
-
-  paddw m7, m9, m2
-  psubw m9, m2
-
-  ; stage 3
-  punpcklwd m6, m1, m3
-  punpckhwd m1, m3
-  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
-  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
-  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
-  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
-  paddd m2, m8
-  paddd m6, m8
-  paddd m3, m8
-  paddd m1, m8
-  psrad m2, 14
-  psrad m6, 14
-  psrad m3, 14
-  psrad m1, 14
-  packssdw m2, m3
-  packssdw m6, m1
-
-  pmulhrsw m7, m12
-  pmulhrsw m9, m12
-
-  punpcklwd m3, m5, m4
-  punpckhwd m5, m4
-  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
-  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
-  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
-  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
-  paddd m1, m8
-  paddd m3, m8
-  paddd m4, m8
-  paddd m5, m8
-  psrad m1, 14
-  psrad m3, 14
-  psrad m4, 14
-  psrad m5, 14
-  packssdw m1, m4
-  packssdw m3, m5
-
-  paddw m4, m0, m9
-  psubw m0, m9
-
-  paddw m5, m10, m7
-  psubw m10, m7
-
-  ; stage 4
-  punpcklwd m9, m5, m4
-  punpckhwd m5, m4
-  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
-  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
-  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
-  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
-  paddd m7, m8
-  paddd m9, m8
-  paddd m4, m8
-  paddd m5, m8
-  psrad m7, 14
-  psrad m9, 14
-  psrad m4, 14
-  psrad m5, 14
-  packssdw m7, m4
-  packssdw m9, m5
-
-  punpcklwd m4, m10, m0
-  punpckhwd m10, m0
-  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
-  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
-  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
-  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
-  paddd m5, m8
-  paddd m4, m8
-  paddd m0, m8
-  paddd m10, m8
-  psrad m5, 14
-  psrad m4, 14
-  psrad m0, 14
-  psrad m10, 14
-  packssdw m5, m0
-  packssdw m4, m10
-
-  ; transpose
-  ; stage 1
-  punpcklwd m0, m2, m7
-  punpcklwd m10, m1, m4
-  punpckhwd m2, m7
-  punpckhwd m1, m4
-  punpcklwd m7, m6, m5
-  punpcklwd m4, m3, m9
-  punpckhwd m6, m5
-  punpckhwd m3, m9
-
-  ; stage 2
-  punpckldq m5, m0, m10
-  punpckldq m9, m2, m1
-  punpckhdq m0, m10
-  punpckhdq m2, m1
-  punpckldq m10, m7, m4
-  punpckldq m1, m6, m3
-  punpckhdq m7, m4
-  punpckhdq m6, m3
-
-  ; stage 3
-  punpcklqdq m4, m5, m10
-  punpckhqdq m5, m10
-  punpcklqdq m3, m0, m7
-  punpckhqdq m0, m7
-  punpcklqdq m10, m9, m1
-  punpckhqdq m9, m1
-  punpcklqdq m7, m2, m6
-  punpckhqdq m2, m6
-
-  psraw m1, m4, 15
-  psraw m6, m5, 15
-  psraw m8, m3, 15
-  psraw m11, m0, 15
-
-  psubw m4, m1
-  psubw m5, m6
-  psubw m3, m8
-  psubw m0, m11
-
-  psraw m4, 1
-  psraw m5, 1
-  psraw m3, 1
-  psraw m0, 1
-
-  psraw m1, m10, 15
-  psraw m6, m9, 15
-  psraw m8, m7, 15
-  psraw m11, m2, 15
-
-  psubw m10, m1
-  psubw m9, m6
-  psubw m7, m8
-  psubw m2, m11
-
-  psraw m10, 1
-  psraw m9, 1
-  psraw m7, 1
-  psraw m2, 1
-
-  STORE_OUTPUT  0,  4
-  STORE_OUTPUT  8,  5
-  STORE_OUTPUT 16,  3
-  STORE_OUTPUT 24,  0
-  STORE_OUTPUT 32, 10
-  STORE_OUTPUT 40,  9
-  STORE_OUTPUT 48,  7
-  STORE_OUTPUT 56,  2
-
-  RET
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
deleted file mode 100644
index 099fcf7fc..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ /dev/null
@@ -1,998 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-#include <string.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/synonyms.h"
-
-// -----------------------------------------------------------------------------
-// Copy and average
-
-void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst8, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
-                                   int width, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
-      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-      p1 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storeu_si128((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storeu_si128((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storel_epi64((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  }
-}
-
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
-                                   uint16_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_q4;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  __m256i s[8], coeffs_y[4];
-
-  const int bits = FILTER_BITS;
-
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
-  const __m256i clip_pixel =
-      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m256i zero = _mm256_setzero_si256();
-
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
-  for (j = 0; j < w; j += 8) {
-    const uint16_t *data = &src_ptr[j];
-    /* Vertical filter */
-    {
-      __m256i src6;
-      __m256i s01 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-          0x20);
-      __m256i s12 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-          0x20);
-      __m256i s23 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-          0x20);
-      __m256i s34 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-          0x20);
-      __m256i s45 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-          0x20);
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-      __m256i s56 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-          src6, 0x20);
-
-      s[0] = _mm256_unpacklo_epi16(s01, s12);
-      s[1] = _mm256_unpacklo_epi16(s23, s34);
-      s[2] = _mm256_unpacklo_epi16(s45, s56);
-
-      s[4] = _mm256_unpackhi_epi16(s01, s12);
-      s[5] = _mm256_unpackhi_epi16(s23, s34);
-      s[6] = _mm256_unpackhi_epi16(s45, s56);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-
-        const __m256i s67 = _mm256_permute2x128_si256(
-            src6,
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            0x20);
-
-        src6 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-
-        const __m256i s78 = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            src6, 0x20);
-
-        s[3] = _mm256_unpacklo_epi16(s67, s78);
-        s[7] = _mm256_unpackhi_epi16(s67, s78);
-
-        const __m256i res_a = convolve(s, coeffs_y);
-
-        __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
-
-        if (w - j > 4) {
-          const __m256i res_b = convolve(s + 4, coeffs_y);
-          __m256i res_b_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
-
-          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
-          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
-          res_16bit = _mm256_max_epi16(res_16bit, zero);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
-                           _mm256_castsi256_si128(res_16bit));
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           _mm256_extracti128_si256(res_16bit, 1));
-        } else if (w == 4) {
-          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
-          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
-          res_a_round = _mm256_max_epi16(res_a_round, zero);
-
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
-                           _mm256_castsi256_si128(res_a_round));
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           _mm256_extracti128_si256(res_a_round, 1));
-        } else {
-          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
-          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
-          res_a_round = _mm256_max_epi16(res_a_round, zero);
-
-          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
-                       _mm256_castsi256_si128(res_a_round));
-          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                       _mm256_extracti128_si256(res_a_round, 1));
-        }
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-      }
-    }
-  }
-}
-
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
-                                   uint16_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_q4;
-  (void)filter_params_y;
-
-  // Check that, even with 12-bit input, the intermediate values will fit
-  // into an unsigned 16-bit intermediate array.
-  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
-  __m256i s[4], coeffs_x[4];
-
-  const __m256i round_const_x =
-      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
-  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
-  const __m256i clip_pixel =
-      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m256i zero = _mm256_setzero_si256();
-
-  assert(bits >= 0);
-  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
-         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
-
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    for (i = 0; i < h; i += 2) {
-      const __m256i row0 =
-          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
-      __m256i row1 =
-          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
-
-      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
-      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
-
-      // even pixels
-      s[0] = _mm256_alignr_epi8(r1, r0, 0);
-      s[1] = _mm256_alignr_epi8(r1, r0, 4);
-      s[2] = _mm256_alignr_epi8(r1, r0, 8);
-      s[3] = _mm256_alignr_epi8(r1, r0, 12);
-
-      __m256i res_even = convolve(s, coeffs_x);
-      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
-                                  round_shift_x);
-
-      // odd pixels
-      s[0] = _mm256_alignr_epi8(r1, r0, 2);
-      s[1] = _mm256_alignr_epi8(r1, r0, 6);
-      s[2] = _mm256_alignr_epi8(r1, r0, 10);
-      s[3] = _mm256_alignr_epi8(r1, r0, 14);
-
-      __m256i res_odd = convolve(s, coeffs_x);
-      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
-                                 round_shift_x);
-
-      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
-                                  round_shift_bits);
-      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
-                                 round_shift_bits);
-
-      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
-      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
-
-      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
-      res = _mm256_min_epi16(res, clip_pixel);
-      res = _mm256_max_epi16(res, zero);
-
-      if (w - j > 4) {
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
-                         _mm256_castsi256_si128(res));
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                         _mm256_extracti128_si256(res, 1));
-      } else if (w == 4) {
-        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
-                         _mm256_castsi256_si128(res));
-        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                         _mm256_extracti128_si256(res, 1));
-      } else {
-        xx_storel_32((__m128i *)&dst[i * dst_stride + j],
-                     _mm256_castsi256_si128(res));
-        xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                     _mm256_extracti128_si256(res, 1));
-      }
-    }
-  }
-}
-
-#define CONV8_ROUNDING_BITS (7)
-
-// -----------------------------------------------------------------------------
-// Horizontal and vertical filtering
-
-static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
-                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
-                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
-
-static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
-                                              8, 9, 10, 11, 10, 11, 12, 13,
-                                              4, 5, 6,  7,  6,  7,  8,  9,
-                                              8, 9, 10, 11, 10, 11, 12, 13 };
-
-static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
-                                              10, 11, 12, 13, 12, 13, 14, 15,
-                                              6,  7,  8,  9,  8,  9,  10, 11,
-                                              10, 11, 12, 13, 12, 13, 14, 15 };
-
-static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
-
-// -----------------------------------------------------------------------------
-// Horizontal Filtering
-
-static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
-  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
-  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
-  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
-  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
-
-  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
-  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
-  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
-  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
-}
-
-// Note:
-//  Shared by 8x2 and 16x1 block
-static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
-                                  __m256i *x /*x[8]*/) {
-  __m256i pp[8];
-  pack_pixels(s0, pp);
-  pack_pixels(s1, &pp[4]);
-  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
-  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
-  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
-  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
-  x[4] = x[2];
-  x[5] = x[3];
-  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
-  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
-}
-
-static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
-  __m256i pp[8];
-  __m256i s0;
-  s0 = _mm256_loadu_si256((const __m256i *)src);
-  pack_pixels(&s0, pp);
-  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
-  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
-  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
-  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
-}
-
-static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
-                                   __m256i *x) {
-  __m256i s0, s1;
-  s0 = _mm256_loadu_si256((const __m256i *)src);
-  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
-  pack_16_pixels(&s0, &s1, x);
-}
-
-static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
-  __m256i s0, s1;
-  s0 = _mm256_loadu_si256((const __m256i *)src);
-  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
-  pack_16_pixels(&s0, &s1, x);
-}
-
-// Note:
-//  Shared by horizontal and vertical filtering
-static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
-  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
-  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
-  const __m256i p0 = _mm256_set1_epi32(0x03020100);
-  const __m256i p1 = _mm256_set1_epi32(0x07060504);
-  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
-  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
-  f[0] = _mm256_shuffle_epi8(hh, p0);
-  f[1] = _mm256_shuffle_epi8(hh, p1);
-  f[2] = _mm256_shuffle_epi8(hh, p2);
-  f[3] = _mm256_shuffle_epi8(hh, p3);
-}
-
-static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
-                                     const __m256i *fil /*fil[4]*/,
-                                     __m256i *y) {
-  __m256i a, a0, a1;
-
-  a0 = _mm256_madd_epi16(fil[0], sig[0]);
-  a1 = _mm256_madd_epi16(fil[3], sig[3]);
-  a = _mm256_add_epi32(a0, a1);
-
-  a0 = _mm256_madd_epi16(fil[1], sig[1]);
-  a1 = _mm256_madd_epi16(fil[2], sig[2]);
-
-  {
-    const __m256i min = _mm256_min_epi32(a0, a1);
-    a = _mm256_add_epi32(a, min);
-  }
-  {
-    const __m256i max = _mm256_max_epi32(a0, a1);
-    a = _mm256_add_epi32(a, max);
-  }
-  {
-    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-    a = _mm256_add_epi32(a, rounding);
-    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
-  }
-}
-
-static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
-                                    uint16_t *dst) {
-  const __m128i a0 = _mm256_castsi256_si128(*y);
-  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
-  __m128i res = _mm_packus_epi32(a0, a1);
-  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
-                                    const __m256i *mask, uint16_t *dst,
-                                    ptrdiff_t pitch) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  a = _mm256_min_epi16(a, *mask);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
-  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
-}
-
-static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
-                                     const __m256i *mask, uint16_t *dst) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  a = _mm256_min_epi16(a, *mask);
-  _mm256_storeu_si256((__m256i *)dst, a);
-}
-
-static void aom_highbd_filter_block1d8_h8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_pixels(src_ptr, src_pitch, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    store_8x1_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// 2-tap horizontal filtering
-
-static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
-  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
-  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
-  const __m256i p = _mm256_set1_epi32(0x09080706);
-  f[0] = _mm256_shuffle_epi8(hh, p);
-}
-
-// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
-// the difference is s0/s1 specifies first and second rows or,
-// first 16 samples and 8-sample shifted 16 samples
-static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
-                                     __m256i *sig) {
-  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
-  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
-  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
-  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
-  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
-  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
-  r0 = _mm256_shuffle_epi8(r0, sf2);
-  r1 = _mm256_shuffle_epi8(r1, sf2);
-  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
-  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
-}
-
-static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
-                                      const ptrdiff_t pitch, __m256i *sig) {
-  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
-  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
-  pack_16_2t_pixels(&r0, &r1, sig);
-}
-
-static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
-                                       __m256i *sig /*sig[2]*/) {
-  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
-  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
-  pack_16_2t_pixels(&r0, &r1, sig);
-}
-
-static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
-                                      __m256i *sig /*sig[2]*/) {
-  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
-  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
-  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
-  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
-  r0 = _mm256_permutevar8x32_epi32(r0, idx);
-  r0 = _mm256_shuffle_epi8(r0, sf2);
-  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
-}
-
-// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
-static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
-                                       __m256i *y0, __m256i *y1) {
-  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
-  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
-  x0 = _mm256_add_epi32(x0, rounding);
-  x1 = _mm256_add_epi32(x1, rounding);
-  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
-  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
-}
-
-static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
-                                        __m256i *y0) {
-  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
-  x0 = _mm256_add_epi32(x0, rounding);
-  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
-}
-
-static void aom_highbd_filter_block1d8_h2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_2t_pixels(src_ptr, signal);
-    filter_8x1_2t_pixels(signal, &ff, &res0);
-    store_8x1_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_2t_pixels(src_ptr, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// Vertical Filtering
-
-static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
-  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
-  __m256i s1 =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
-  __m256i s2 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
-  __m256i s3 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
-  __m256i s4 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
-  __m256i s5 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
-  __m256i s6 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
-
-  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
-  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
-  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
-  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
-  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
-  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
-
-  sig[0] = _mm256_unpacklo_epi16(s0, s1);
-  sig[4] = _mm256_unpackhi_epi16(s0, s1);
-  sig[1] = _mm256_unpacklo_epi16(s2, s3);
-  sig[5] = _mm256_unpackhi_epi16(s2, s3);
-  sig[2] = _mm256_unpacklo_epi16(s4, s5);
-  sig[6] = _mm256_unpackhi_epi16(s4, s5);
-  sig[8] = s6;
-}
-
-static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
-                                   __m256i *sig) {
-  // base + 7th row
-  __m256i s0 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
-  // base + 8th row
-  __m256i s1 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
-  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
-  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
-  sig[3] = _mm256_unpacklo_epi16(s2, s3);
-  sig[7] = _mm256_unpackhi_epi16(s2, s3);
-  sig[8] = s1;
-}
-
-static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
-                                     __m256i *y0, __m256i *y1) {
-  filter_8x1_pixels(sig, f, y0);
-  filter_8x1_pixels(&sig[4], f, y1);
-}
-
-static INLINE void update_pixels(__m256i *sig) {
-  int i;
-  for (i = 0; i < 3; ++i) {
-    sig[i] = sig[i + 1];
-    sig[i + 4] = sig[i + 5];
-  }
-}
-
-static void aom_highbd_filter_block1d8_v8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[9], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_8x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_8x9_pixels(src_ptr, src_pitch, signal);
-
-    filter_8x9_pixels(signal, ff, &res0, &res1);
-    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
-  __m256i u0, u1, u2, u3;
-  // load 0-6 rows
-  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
-  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
-  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
-  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
-  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
-  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
-  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
-
-  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
-  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
-
-  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
-  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high
-
-  sig[0] = _mm256_unpacklo_epi16(u0, u2);
-  sig[4] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[8] = _mm256_unpacklo_epi16(u1, u3);
-  sig[12] = _mm256_unpackhi_epi16(u1, u3);
-
-  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
-  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
-
-  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
-  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
-
-  sig[1] = _mm256_unpacklo_epi16(u0, u2);
-  sig[5] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[9] = _mm256_unpacklo_epi16(u1, u3);
-  sig[13] = _mm256_unpackhi_epi16(u1, u3);
-
-  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
-  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
-
-  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
-  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
-
-  sig[2] = _mm256_unpacklo_epi16(u0, u2);
-  sig[6] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[10] = _mm256_unpacklo_epi16(u1, u3);
-  sig[14] = _mm256_unpackhi_epi16(u1, u3);
-
-  sig[16] = s6;
-}
-
-static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
-                             __m256i *sig) {
-  // base + 7th row
-  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
-  // base + 8th row
-  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
-
-  __m256i u0, u1, u2, u3;
-  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
-  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
-
-  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
-  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
-
-  sig[3] = _mm256_unpacklo_epi16(u0, u2);
-  sig[7] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[11] = _mm256_unpacklo_epi16(u1, u3);
-  sig[15] = _mm256_unpackhi_epi16(u1, u3);
-
-  sig[16] = s8;
-}
-
-static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
-                                      __m256i *y0, __m256i *y1) {
-  __m256i res[4];
-  int i;
-  for (i = 0; i < 4; ++i) {
-    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
-  }
-
-  {
-    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
-    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
-    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
-    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
-  }
-}
-
-static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
-                                     const __m256i *mask, uint16_t *dst,
-                                     ptrdiff_t pitch) {
-  __m256i p = _mm256_min_epi16(*y0, *mask);
-  _mm256_storeu_si256((__m256i *)dst, p);
-  p = _mm256_min_epi16(*y1, *mask);
-  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
-}
-
-static void update_16x9_pixels(__m256i *sig) {
-  update_pixels(&sig[0]);
-  update_pixels(&sig[8]);
-}
-
-static void aom_highbd_filter_block1d16_v8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[17], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_16x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_16x9_pixels(src_ptr, src_pitch, signal);
-    filter_16x9_pixels(signal, ff, &res0, &res1);
-    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_16x9_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// 2-tap vertical filtering
-
-static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
-  sig[2] = _mm256_loadu_si256((const __m256i *)src);
-}
-
-static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
-                                       __m256i *sig) {
-  // load the next row
-  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
-  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
-  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
-  sig[2] = u;
-}
-
-static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
-                                         __m256i *y0, __m256i *y1) {
-  filter_16_2t_pixels(sig, f, y0, y1);
-}
-
-static void aom_highbd_filter_block1d16_v2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[3], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-  __m256i ff;
-
-  pack_2t_filter(filter, &ff);
-  pack_16x2_init(src_ptr, signal);
-
-  do {
-    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
-  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
-  const __m128i p = _mm_set1_epi32(0x09080706);
-  f[0] = _mm_shuffle_epi8(h, p);
-}
-
-static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
-  sig[2] = _mm_loadu_si128((const __m128i *)src);
-}
-
-static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
-                                          __m128i *sig) {
-  // load the next row
-  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
-  sig[0] = _mm_unpacklo_epi16(sig[2], u);
-  sig[1] = _mm_unpackhi_epi16(sig[2], u);
-  sig[2] = u;
-}
-
-static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
-                                      __m128i *y0, __m128i *y1) {
-  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-  __m128i x0 = _mm_madd_epi16(sig[0], *f);
-  __m128i x1 = _mm_madd_epi16(sig[1], *f);
-  x0 = _mm_add_epi32(x0, rounding);
-  x1 = _mm_add_epi32(x1, rounding);
-  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
-  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
-}
-
-static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
-                                           const __m128i *mask, uint16_t *dst) {
-  __m128i res = _mm_packus_epi32(*y0, *y1);
-  res = _mm_min_epi16(res, *mask);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static void aom_highbd_filter_block1d8_v2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m128i signal[3], res0, res1;
-  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
-  __m128i ff;
-
-  pack_8x1_2t_filter(filter, &ff);
-  pack_8x2_init(src_ptr, signal);
-
-  do {
-    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
-    filter_8_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
-#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
-#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
-#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
-
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
deleted file mode 100644
index e7b33d1c4..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve_sse2.h"
-
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
-                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams *filter_params_x,
-                                    const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
-                                    ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_q4;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  __m128i s[16], coeffs_y[4];
-
-  const int bits = FILTER_BITS;
-
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i clip_pixel =
-      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m128i zero = _mm_setzero_si128();
-
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
-  for (j = 0; j < w; j += 8) {
-    const uint16_t *data = &src_ptr[j];
-    /* Vertical filter */
-    {
-      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
-      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
-      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
-
-      s[0] = _mm_unpacklo_epi16(s0, s1);
-      s[1] = _mm_unpacklo_epi16(s2, s3);
-      s[2] = _mm_unpacklo_epi16(s4, s5);
-
-      s[4] = _mm_unpackhi_epi16(s0, s1);
-      s[5] = _mm_unpackhi_epi16(s2, s3);
-      s[6] = _mm_unpackhi_epi16(s4, s5);
-
-      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
-      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
-      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
-
-      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
-      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
-      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-
-        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
-        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
-
-        s[3] = _mm_unpacklo_epi16(s6, s7);
-        s[7] = _mm_unpackhi_epi16(s6, s7);
-
-        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
-        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
-
-        const __m128i res_a0 = convolve(s, coeffs_y);
-        __m128i res_a_round0 = _mm_sra_epi32(
-            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
-
-        const __m128i res_a1 = convolve(s + 8, coeffs_y);
-        __m128i res_a_round1 = _mm_sra_epi32(
-            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
-
-        if (w - j > 4) {
-          const __m128i res_b0 = convolve(s + 4, coeffs_y);
-          __m128i res_b_round0 = _mm_sra_epi32(
-              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
-
-          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
-          __m128i res_b_round1 = _mm_sra_epi32(
-              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
-
-          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
-          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
-          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
-
-          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
-          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
-          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_16bit1);
-        } else if (w == 4) {
-          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
-          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
-          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
-          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
-          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
-          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_a_round1);
-        } else {
-          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
-          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
-          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
-          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
-          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
-          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
-          *((uint32_t *)(&dst[i * dst_stride + j])) =
-              _mm_cvtsi128_si32(res_a_round0);
-
-          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
-              _mm_cvtsi128_si32(res_a_round1);
-        }
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-
-        s[0 + 8] = s[1 + 8];
-        s[1 + 8] = s[2 + 8];
-        s[2 + 8] = s[3 + 8];
-
-        s[4 + 8] = s[5 + 8];
-        s[5 + 8] = s[6 + 8];
-        s[6 + 8] = s[7 + 8];
-
-        s6 = s8;
-      }
-    }
-  }
-}
-
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
-                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams *filter_params_x,
-                                    const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
-                                    ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_q4;
-  (void)filter_params_y;
-
-  // Check that, even with 12-bit input, the intermediate values will fit
-  // into an unsigned 16-bit intermediate array.
-  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
-  __m128i s[4], coeffs_x[4];
-
-  const __m128i round_const_x =
-      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
-  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
-  const int bits = FILTER_BITS - conv_params->round_0;
-
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i clip_pixel =
-      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m128i zero = _mm_setzero_si128();
-
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    {
-      for (i = 0; i < h; i += 1) {
-        const __m128i row00 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i row01 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
-
-        // even pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 0);
-        s[1] = _mm_alignr_epi8(row01, row00, 4);
-        s[2] = _mm_alignr_epi8(row01, row00, 8);
-        s[3] = _mm_alignr_epi8(row01, row00, 12);
-
-        __m128i res_even = convolve(s, coeffs_x);
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
-                                 round_shift_x);
-
-        // odd pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 2);
-        s[1] = _mm_alignr_epi8(row01, row00, 6);
-        s[2] = _mm_alignr_epi8(row01, row00, 10);
-        s[3] = _mm_alignr_epi8(row01, row00, 14);
-
-        __m128i res_odd = convolve(s, coeffs_x);
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
-
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
-                                 round_shift_bits);
-        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
-                                round_shift_bits);
-
-        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
-        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
-        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
-
-        res = _mm_min_epi16(res, clip_pixel);
-        res = _mm_max_epi16(res, zero);
-
-        if (w - j > 4) {
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-        } else if (w == 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
-        } else {
-          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
-        }
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
deleted file mode 100644
index 5a55736c4..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ /dev/null
@@ -1,984 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  (void)above;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
-  dst += stride << 2;
-  left += 4;
-  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
-}
-
-void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  (void)above;
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
-}
-
-void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
-  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
-  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
-  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
-  (void)above;
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
-}
-
-void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
-  dst += stride << 3;
-  left += 8;
-  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
-}
-
-static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpacklo_epi64(*row, *row);
-  _mm_store_si128((__m128i *)*dst, val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  *dst += stride;
-}
-
-static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpackhi_epi64(*row, *row);
-  _mm_store_si128((__m128i *)(*dst), val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  *dst += stride;
-}
-
-static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *left) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
-  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
-  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
-  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
-  h_store_16_unpacklo(&dst, stride, &row0);
-  h_store_16_unpacklo(&dst, stride, &row1);
-  h_store_16_unpacklo(&dst, stride, &row2);
-  h_store_16_unpacklo(&dst, stride, &row3);
-  h_store_16_unpackhi(&dst, stride, &row4);
-  h_store_16_unpackhi(&dst, stride, &row5);
-  h_store_16_unpackhi(&dst, stride, &row6);
-  h_store_16_unpackhi(&dst, stride, &row7);
-}
-
-void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)above;
-  (void)bd;
-  h_predictor_16x8(dst, stride, left);
-}
-
-void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 2; i++, left += 8) {
-    h_predictor_16x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 4; i++, left += 8) {
-    h_predictor_16x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpacklo_epi64(*row, *row);
-  _mm_store_si128((__m128i *)(*dst), val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  _mm_store_si128((__m128i *)(*dst + 16), val);
-  _mm_store_si128((__m128i *)(*dst + 24), val);
-  *dst += stride;
-}
-
-static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpackhi_epi64(*row, *row);
-  _mm_store_si128((__m128i *)(*dst), val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  _mm_store_si128((__m128i *)(*dst + 16), val);
-  _mm_store_si128((__m128i *)(*dst + 24), val);
-  *dst += stride;
-}
-
-static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *left) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
-  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
-  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
-  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
-  h_store_32_unpacklo(&dst, stride, &row0);
-  h_store_32_unpacklo(&dst, stride, &row1);
-  h_store_32_unpacklo(&dst, stride, &row2);
-  h_store_32_unpacklo(&dst, stride, &row3);
-  h_store_32_unpackhi(&dst, stride, &row4);
-  h_store_32_unpackhi(&dst, stride, &row5);
-  h_store_32_unpackhi(&dst, stride, &row6);
-  h_store_32_unpackhi(&dst, stride, &row7);
-}
-
-void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 2; i++, left += 8) {
-    h_predictor_32x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 4; i++, left += 8) {
-    h_predictor_32x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP, DC_LEFT, DC_128
-
-// 4x4
-
-static INLINE __m128i dc_sum_4(const uint16_t *ref) {
-  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
-  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
-  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
-  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
-}
-
-static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
-                                const __m128i *dc) {
-  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
-  int i;
-  for (i = 0; i < 4; ++i, dst += stride) {
-    _mm_storel_epi64((__m128i *)dst, dc_dup);
-  }
-}
-
-void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)above;
-  (void)bd;
-  dc_store_4x4(dst, stride, &dc);
-}
-
-void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)left;
-  (void)bd;
-  dc_store_4x4(dst, stride, &dc);
-}
-
-void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_4x4(dst, stride, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 4x8
-
-static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
-                                const __m128i *dc) {
-  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
-  int i;
-  for (i = 0; i < 8; ++i, dst += stride) {
-    _mm_storel_epi64((__m128i *)dst, dc_dup);
-  }
-}
-
-// Shared with DC 8xh
-static INLINE __m128i dc_sum_8(const uint16_t *ref) {
-  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
-  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
-  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
-  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
-
-  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
-}
-
-void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i sum = dc_sum_8(left);
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  (void)above;
-  (void)bd;
-  dc_store_4x8(dst, stride, &dc);
-}
-
-void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)left;
-  (void)bd;
-  dc_store_4x8(dst, stride, &dc);
-}
-
-void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_4x8(dst, stride, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 8xh
-
-static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
-                                const __m128i *dc) {
-  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
-  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
-  int i;
-  for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
-                                        int height, const uint16_t *above) {
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i sum = dc_sum_8(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  dc_store_8xh(dst, stride, height, &dc);
-}
-
-void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  dc_top_predictor_8xh(dst, stride, 4, above);
-}
-
-void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  dc_top_predictor_8xh(dst, stride, 8, above);
-}
-
-void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  dc_top_predictor_8xh(dst, stride, 16, above);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)above;
-  (void)bd;
-  dc_store_8xh(dst, stride, 4, &dc);
-}
-
-void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i sum = dc_sum_8(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  (void)above;
-  (void)bd;
-  dc_store_8xh(dst, stride, 8, &dc);
-}
-
-// Shared with DC 16xh
-static INLINE __m128i dc_sum_16(const uint16_t *ref) {
-  const __m128i sum_lo = dc_sum_8(ref);
-  const __m128i sum_hi = dc_sum_8(ref + 8);
-  return _mm_add_epi16(sum_lo, sum_hi);
-}
-
-void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)above;
-  (void)bd;
-  dc_store_8xh(dst, stride, 16, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
-                                        int height, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  dc_store_8xh(dst, stride, height, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)above;
-  (void)left;
-  dc_128_predictor_8xh(dst, stride, 4, bd);
-}
-
-void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)above;
-  (void)left;
-  dc_128_predictor_8xh(dst, stride, 8, bd);
-}
-
-void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  (void)above;
-  (void)left;
-  dc_128_predictor_8xh(dst, stride, 16, bd);
-}
-
-// -----------------------------------------------------------------------------
-// 16xh
-
-static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
-                                 const __m128i *dc) {
-  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
-  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
-  int i;
-  for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
-    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i sum = dc_sum_8(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  (void)above;
-  (void)bd;
-  dc_store_16xh(dst, stride, 8, &dc);
-}
-
-void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)above;
-  (void)bd;
-  dc_store_16xh(dst, stride, 16, &dc);
-}
-
-// Shared with 32xh
-static INLINE __m128i dc_sum_32(const uint16_t *ref) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sum_a = dc_sum_16(ref);
-  const __m128i sum_b = dc_sum_16(ref + 16);
-  // 12 bit bd will outrange, so expand to 32 bit before adding final total
-  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
-                       _mm_unpacklo_epi16(sum_b, zero));
-}
-
-void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(left);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)above;
-  (void)bd;
-  dc_store_16xh(dst, stride, 32, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)left;
-  (void)bd;
-  dc_store_16xh(dst, stride, 8, &dc);
-}
-
-void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)left;
-  (void)bd;
-  dc_store_16xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)left;
-  (void)bd;
-  dc_store_16xh(dst, stride, 32, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_16xh(dst, stride, 8, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_16xh(dst, stride, 16, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_16xh(dst, stride, 32, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 32xh
-
-static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
-                                 const __m128i *dc) {
-  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
-  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
-  int i;
-  for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
-    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
-    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
-    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
-  }
-}
-
-void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)above;
-  (void)bd;
-  dc_store_32xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(left);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)above;
-  (void)bd;
-  dc_store_32xh(dst, stride, 32, &dc);
-}
-
-void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(above);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)left;
-  (void)bd;
-  dc_store_32xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_32xh(dst, stride, 16, &dc_dup);
-}
-
-void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(above);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)left;
-  (void)bd;
-  dc_store_32xh(dst, stride, 32, &dc);
-}
-
-void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_32xh(dst, stride, 32, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
-  int i;
-  for (i = 0; i < 2; ++i) {
-    _mm_storel_epi64((__m128i *)dst, above_u16);
-    _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
-    _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
-    _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
-    dst += stride << 2;
-  }
-}
-
-void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
-  _mm_store_si128((__m128i *)dst, above_u16);
-  _mm_store_si128((__m128i *)(dst + stride), above_u16);
-  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
-  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
-}
-
-void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, above_u16);
-    _mm_store_si128((__m128i *)(dst + stride), above_u16);
-    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
-    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
-    dst += stride << 2;
-  }
-}
-
-void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
-  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
-  int i;
-  for (i = 0; i < 2; ++i) {
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-  }
-}
-
-void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
-  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
-  int i;
-  for (i = 0; i < 8; ++i) {
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-  }
-}
-
-void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
-  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_PRED
-
-void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)bd;
-  const __m128i sum_above = dc_sum_4(above);
-  const __m128i sum_left = dc_sum_8(left);
-  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 >>= 16;
-  sum32 += 6;
-  sum32 /= 12;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_storel_epi64((__m128i *)dst, row);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)bd;
-  const __m128i sum_left = dc_sum_4(left);
-  const __m128i sum_above = dc_sum_8(above);
-  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 >>= 16;
-  sum32 += 6;
-  sum32 /= 12;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-
-  _mm_store_si128((__m128i *)dst, row);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
-}
-
-void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_left = _mm_unpacklo_epi16(sum_left, zero);
-  sum_above = _mm_unpacklo_epi16(sum_above, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 12;
-  sum32 /= 24;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_left = _mm_unpacklo_epi16(sum_left, zero);
-  sum_above = _mm_unpacklo_epi16(sum_above, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 12;
-  sum32 /= 24;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 2; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_above = _mm_unpacklo_epi16(sum_above, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 24;
-  sum32 /= 48;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 8; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_left = _mm_unpacklo_epi16(sum_left, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 24;
-  sum32 /= 48;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
deleted file mode 100644
index 91b3d126c..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
+++ /dev/null
@@ -1,259 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_4:  times 8 dw 4
-pw_8:  times 8 dw 8
-pw_16: times 4 dd 16
-pw_32: times 4 dd 32
-
-SECTION .text
-INIT_XMM sse2
-cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  movq                  m0, [aboveq]
-  movq                  m2, [leftq]
-  paddw                 m0, m2
-  pshuflw               m1, m0, 0xe
-  paddw                 m0, m1
-  pshuflw               m1, m0, 0x1
-  paddw                 m0, m1
-  paddw                 m0, [GLOBAL(pw_4)]
-  psraw                 m0, 3
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3, one
-  mov                 oned, 0x00010001
-  lea             stride3q, [strideq*3]
-  movd                  m3, oned
-  pshufd                m3, m3, 0x0
-  paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
-  paddw                 m0, [GLOBAL(pw_8)]
-  psrlw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-  lea                 dstq, [dstq+strideq*8]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m3, [aboveq+16]
-  mova                  m2, [leftq]
-  mova                  m4, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  movhlps               m2, m0
-  paddw                 m0, m2
-  punpcklwd             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  punpckldq             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  paddd                 m0, [GLOBAL(pw_16)]
-  psrad                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-.loop:
-  mova   [dstq              ], m0
-  mova   [dstq           +16], m0
-  mova   [dstq+strideq*2    ], m0
-  mova   [dstq+strideq*2 +16], m0
-  mova   [dstq+strideq*4    ], m0
-  mova   [dstq+strideq*4 +16], m0
-  mova   [dstq+stride3q*2   ], m0
-  mova   [dstq+stride3q*2+16], m0
-  lea                 dstq, [dstq+strideq*8]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [aboveq+32]
-  mova                  m4, [aboveq+48]
-  paddw                 m0, m2
-  paddw                 m3, m4
-  mova                  m2, [leftq]
-  mova                  m4, [leftq+16]
-  mova                  m5, [leftq+32]
-  mova                  m6, [leftq+48]
-  paddw                 m2, m4
-  paddw                 m5, m6
-  paddw                 m0, m3
-  paddw                 m2, m5
-  pxor                  m1, m1
-  paddw                 m0, m2
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  movhlps               m2, m0
-  paddw                 m0, m2
-  punpcklwd             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  punpckldq             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  paddd                 m0, [GLOBAL(pw_32)]
-  psrad                 m0, 6
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-.loop:
-  mova [dstq               ], m0
-  mova [dstq          +16  ], m0
-  mova [dstq          +32  ], m0
-  mova [dstq          +48  ], m0
-  mova [dstq+strideq*2     ], m0
-  mova [dstq+strideq*2+16  ], m0
-  mova [dstq+strideq*2+32  ], m0
-  mova [dstq+strideq*2+48  ], m0
-  mova [dstq+strideq*4     ], m0
-  mova [dstq+strideq*4+16  ], m0
-  mova [dstq+strideq*4+32  ], m0
-  mova [dstq+strideq*4+48  ], m0
-  mova [dstq+stride3q*2    ], m0
-  mova [dstq+stride3q*2 +16], m0
-  mova [dstq+stride3q*2 +32], m0
-  mova [dstq+stride3q*2 +48], m0
-  lea                 dstq, [dstq+strideq*8]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
-  movq                  m0, [aboveq]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-  RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-  lea                 dstq, [dstq+strideq*8]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-  RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 4
-.loop:
-  mova    [dstq              ], m0
-  mova    [dstq           +16], m1
-  mova    [dstq+strideq*2    ], m0
-  mova    [dstq+strideq*2 +16], m1
-  mova    [dstq+strideq*4    ], m0
-  mova    [dstq+strideq*4 +16], m1
-  mova    [dstq+stride3q*2   ], m0
-  mova    [dstq+stride3q*2+16], m1
-  lea                 dstq, [dstq+strideq*8]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  mova                  m2, [aboveq+32]
-  mova                  m3, [aboveq+48]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 8
-.loop:
-  mova [dstq               ], m0
-  mova [dstq            +16], m1
-  mova [dstq            +32], m2
-  mova [dstq            +48], m3
-  mova [dstq+strideq*2     ], m0
-  mova [dstq+strideq*2  +16], m1
-  mova [dstq+strideq*2  +32], m2
-  mova [dstq+strideq*2  +48], m3
-  mova [dstq+strideq*4     ], m0
-  mova [dstq+strideq*4  +16], m1
-  mova [dstq+strideq*4  +32], m2
-  mova [dstq+strideq*4  +48], m3
-  mova [dstq+stride3q*2    ], m0
-  mova [dstq+stride3q*2 +16], m1
-  mova [dstq+stride3q*2 +32], m2
-  mova [dstq+stride3q*2 +48], m3
-  lea                 dstq, [dstq+strideq*8]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
deleted file mode 100644
index c954da94e..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/common_avx2.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom/aom_integer.h"
-
-void aom_highbd_lpf_horizontal_14_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
-                                         blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                       limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                        limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                        limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                      limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                      limit1, thresh1, bd);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
deleted file mode 100644
index 097e0778f..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ /dev/null
@@ -1,1697 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/lpf_common_sse2.h"
-
-static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                                         __m128i *pixel) {
-  *pixel = _mm_min_epi16(*pixel, *max);
-  *pixel = _mm_max_epi16(*pixel, *min);
-}
-
-static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
-  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
-}
-
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
-                             const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
-  *blt = _mm_slli_epi16(x, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
-  *lt = _mm_slli_epi16(x, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
-  *thr = _mm_slli_epi16(x, shift);
-
-  *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void get_limit_dual(
-    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
-    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
-    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
-    __m128i *t80_out) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
-  __m128i x1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *blt_out = _mm_slli_epi16(x0, shift);
-
-  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
-  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *lt_out = _mm_slli_epi16(x0, shift);
-
-  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
-  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *thr_out = _mm_slli_epi16(x0, shift);
-
-  *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
-                                     __m128i *p, __m128i *q) {
-  int i;
-  for (i = 0; i < size; i++) {
-    p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
-    q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
-  }
-}
-
-static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
-                                           const __m128i *l, const __m128i *bl,
-                                           __m128i *mask) {
-  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
-  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-
-  __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
-  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
-  int i;
-  for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
-    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
-  }
-  max = _mm_subs_epu16(max, *l);
-  *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
-}
-
-static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
-                                                 __m128i *p1p0, __m128i *q1q0,
-                                                 __m128i *abs_p1p0, __m128i *l,
-                                                 __m128i *bl, __m128i *t,
-                                                 __m128i *hev, __m128i *mask) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
-  __m128i max, max01, h;
-
-  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
-  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
-
-  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
-  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
-
-  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
-  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
-  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
-  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
-  // mask |= (abs(*p1 - *p0) > limit) * -1;
-  // mask |= (abs(*q1 - *q0) > limit) * -1;
-  h = _mm_subs_epu16(max01, *t);
-
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-  // replicate for the further "merged variables" usage
-  *hev = _mm_unpacklo_epi64(*hev, *hev);
-
-  max = _mm_max_epi16(max, max01);
-  int i;
-  for (i = 2; i < x; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
-  }
-  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
-  max = _mm_subs_epu16(max, *l);
-  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
-}
-
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
-                                      int start, int end, __m128i *flat) {
-  int i;
-  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
-                              abs_diff16(pq[start + 1], pq[0]));
-
-  for (i = start + 2; i < end; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
-  }
-  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
-  __m128i ft;
-  ft = _mm_subs_epu16(max, *th);
-
-  const __m128i zero = _mm_setzero_si128();
-  *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
-                                           const __m128i *q, int start, int end,
-                                           __m128i *flat) {
-  int i;
-  __m128i max =
-      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
-
-  for (i = start + 1; i < end; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
-    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
-  }
-
-  __m128i ft;
-  ft = _mm_subs_epu16(max, *th);
-
-  const __m128i zero = _mm_setzero_si128();
-  *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
-                                          __m128i *flat2, int bd) {
-  // check the distance 1,2,3 against 0
-  __m128i th = _mm_set1_epi16(1);
-  th = _mm_slli_epi16(th, bd - 8);
-  flat_mask_internal(&th, pq, 1, 4, flat);
-  flat_mask_internal(&th, pq, 4, 7, flat2);
-}
-
-static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
-                                               const __m128i *q, __m128i *flat,
-                                               __m128i *flat2, int bd) {
-  // check the distance 1,2,3 against 0
-  __m128i th = _mm_set1_epi16(1);
-  th = _mm_slli_epi16(th, bd - 8);
-  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
-  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
-}
-
-static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
-                                                 __m128i *hev, __m128i *mask,
-                                                 __m128i *qs1qs0,
-                                                 __m128i *ps1ps0, __m128i *t80,
-                                                 int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-  const __m128i pmin = _mm_subs_epi16(zero, *t80);
-
-  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
-  __m128i ps1ps0_work, qs1qs0_work, work;
-  __m128i filt, filter2filter1, filter2filt, filter1filt;
-
-  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
-  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
-
-  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
-  pixel_clamp(&pmin, &pmax, &work);
-  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-
-  filt = _mm_subs_epi16(filt, work);
-  filt = _mm_subs_epi16(filt, work);
-  filt = _mm_subs_epi16(filt, work);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, *mask);
-  filt = _mm_unpacklo_epi64(filt, filt);
-
-  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
-  pixel_clamp(&pmin, &pmax, &filter2filter1);
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
-
-  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
-
-  // filt >> 1
-  filt = _mm_adds_epi16(filt, one);
-  filt = _mm_srai_epi16(filt, 1);
-  filt = _mm_andnot_si128(*hev, filt);
-
-  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
-  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
-
-  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
-  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
-
-  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
-  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
-
-  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
-  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
-}
-
-static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
-                                            __m128i *qs, const __m128i *mask,
-                                            const __m128i *th, int bd,
-                                            __m128i *t80) {
-  __m128i ps0 = _mm_subs_epi16(p[0], *t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], *t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], *t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], *t80);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i pmin = _mm_subs_epi16(zero, *t80);
-  __m128i filter = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filter);
-
-  // hev_filter
-  __m128i hev;
-  const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
-  const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *th);
-  const __m128i ffff = _mm_cmpeq_epi16(h, h);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-
-  filter = _mm_and_si128(filter, hev);
-
-  const __m128i x = _mm_subs_epi16(qs0, ps0);
-  filter = _mm_adds_epi16(filter, x);
-  filter = _mm_adds_epi16(filter, x);
-  filter = _mm_adds_epi16(filter, x);
-  pixel_clamp(&pmin, &pmax, &filter);
-  filter = _mm_and_si128(filter, *mask);
-  const __m128i t3 = _mm_set1_epi16(3);
-  const __m128i t4 = _mm_set1_epi16(4);
-  __m128i filter1 = _mm_adds_epi16(filter, t4);
-  __m128i filter2 = _mm_adds_epi16(filter, t3);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter1 = _mm_srai_epi16(filter1, 3);
-  filter2 = _mm_srai_epi16(filter2, 3);
-  qs0 = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &qs0);
-  ps0 = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &ps0);
-  qs[0] = _mm_adds_epi16(qs0, *t80);
-  ps[0] = _mm_adds_epi16(ps0, *t80);
-  filter = _mm_adds_epi16(filter1, one);
-  filter = _mm_srai_epi16(filter, 1);
-  filter = _mm_andnot_si128(hev, filter);
-  qs1 = _mm_subs_epi16(qs1, filter);
-  pixel_clamp(&pmin, &pmax, &qs1);
-  ps1 = _mm_adds_epi16(ps1, filter);
-  pixel_clamp(&pmin, &pmax, &ps1);
-  qs[1] = _mm_adds_epi16(qs1, *t80);
-  ps[1] = _mm_adds_epi16(ps1, *t80);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
-    __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
-    const unsigned char *lt, const unsigned char *thr, int bd) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit, limit, thresh;
-  __m128i t80;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
-
-  for (i = 0; i < 7; i++) {
-    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
-  }
-  __m128i mask, hevhev;
-  __m128i p1p0, q1q0, abs_p1p0;
-
-  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hevhev, &mask);
-
-  __m128i ps0ps1, qs0qs1;
-  // filter4
-  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
-
-  __m128i flat, flat2;
-  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
-
-  flat = _mm_and_si128(flat, mask);
-  flat2 = _mm_and_si128(flat2, flat);
-
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-  flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
-  // flat and wide flat calculations
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i flat_p[3], flat_q[3], flat_pq[3];
-    __m128i flat2_p[6], flat2_q[6];
-    __m128i flat2_pq[6];
-    __m128i sum_p6, sum_p3;
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-
-    __m128i work0, work0_0, work0_1, sum_p_0;
-    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
-    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
-
-    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
-    __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
-    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
-
-    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
-    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
-    sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
-    work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
-    work0_1 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
-
-    sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-
-    work0 = _mm_add_epi16(sum_p3, pq[1]);
-    flat_p[1] = _mm_add_epi16(sum_lp, work0);
-    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
-    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
-    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
-
-    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
-    work0 = _mm_add_epi16(sum_p3, pq[2]);
-
-    flat_p[2] = _mm_add_epi16(sum_lp, work0);
-    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-
-    int flat2_mask =
-        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
-    if (flat2_mask) {
-      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
-      flat2_q[0] = _mm_add_epi16(
-          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
-
-      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
-      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
-      flat2_pq[0] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-      flat2_pq[1] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
-      sum_p = _mm_sub_epi16(sum_p, q[4]);
-      sum_q = _mm_sub_epi16(sum_q, pq[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
-      flat2_p[2] = _mm_add_epi16(sum_p, work0);
-      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[2] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[3]);
-      sum_q = _mm_sub_epi16(sum_q, pq[3]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
-      flat2_p[3] = _mm_add_epi16(sum_p, work0);
-      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[3] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[2]);
-      sum_q = _mm_sub_epi16(sum_q, pq[2]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
-      flat2_p[4] = _mm_add_epi16(sum_p, work0);
-      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[4] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[1]);
-      sum_q = _mm_sub_epi16(sum_q, pq[1]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
-      flat2_p[5] = _mm_add_epi16(sum_p, work0);
-      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[5] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-    }  // flat2
-       // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // highbd_filter8
-    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
-    for (i = 0; i < 3; i++) {
-      pq[i] = _mm_andnot_si128(flat, pq[i]);
-      flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
-      pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
-    }
-
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    if (flat2_mask) {
-      for (i = 0; i < 6; i++) {
-        pq[i] = _mm_andnot_si128(flat2, pq[i]);
-        flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
-        pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
-      }
-    }
-  } else {
-    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
-                                       const uint8_t *blt, const uint8_t *lt,
-                                       const uint8_t *thr, int bd) {
-  __m128i p[7], q[7], pq[7];
-  int i;
-
-  for (i = 0; i < 7; i++) {
-    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
-    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
-  }
-
-  highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
-
-  for (i = 0; i < 6; i++) {
-    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
-    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
-    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
-    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
-    const uint8_t *thr1, int bd) {
-  __m128i blimit, limit, thresh, t80;
-  const __m128i zero = _mm_setzero_si128();
-
-  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
-                 &t80);
-  __m128i mask;
-  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
-  __m128i flat, flat2;
-  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
-
-  flat = _mm_and_si128(flat, mask);
-  flat2 = _mm_and_si128(flat2, flat);
-  __m128i ps[2], qs[2];
-  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
-  // flat and wide flat calculations
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i flat_p[3], flat_q[3];
-    __m128i flat2_p[6], flat2_q[6];
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
-    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
-    __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
-    sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
-    __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
-    sum_q = _mm_add_epi16(sum_q, sum_lq);
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-    flat_p[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
-    flat_q[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
-    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
-    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
-    __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
-    __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, p[5]);
-    __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
-    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-    flat_p[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
-    flat_q[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
-    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-    flat_p[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
-    flat_q[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
-    int flat2_mask =
-        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
-    if (flat2_mask) {
-      flat2_p[0] = _mm_srli_epi16(
-          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
-                                               _mm_add_epi16(p[1], q[0]))),
-          4);
-      flat2_q[0] = _mm_srli_epi16(
-          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
-                                               _mm_add_epi16(p[0], q[1]))),
-          4);
-
-      flat2_p[1] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
-          4);
-      flat2_q[1] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[4]);
-      sum_q = _mm_sub_epi16(sum_q, p[4]);
-      flat2_p[2] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
-          4);
-      flat2_q[2] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[3]);
-      sum_q = _mm_sub_epi16(sum_q, p[3]);
-      flat2_p[3] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
-          4);
-      flat2_q[3] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[2]);
-      sum_q = _mm_sub_epi16(sum_q, p[2]);
-      flat2_p[4] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
-          4);
-      flat2_q[4] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[1]);
-      sum_q = _mm_sub_epi16(sum_q, p[1]);
-      flat2_p[5] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
-          4);
-      flat2_q[5] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
-          4);
-    }
-    // highbd_filter8
-    int i;
-    for (i = 0; i < 2; i++) {
-      ps[i] = _mm_andnot_si128(flat, ps[i]);
-      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-      p[i] = _mm_or_si128(ps[i], flat_p[i]);
-      qs[i] = _mm_andnot_si128(flat, qs[i]);
-      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-      q[i] = _mm_or_si128(qs[i], flat_q[i]);
-    }
-    p[2] = _mm_andnot_si128(flat, p[2]);
-    //  p2 remains unchanged if !(flat && mask)
-    flat_p[2] = _mm_and_si128(flat, flat_p[2]);
-    //  when (flat && mask)
-    p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
-    q[2] = _mm_andnot_si128(flat, q[2]);
-    flat_q[2] = _mm_and_si128(flat, flat_q[2]);
-    q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-
-    for (i = 0; i < 2; i++) {
-      ps[i] = _mm_andnot_si128(flat, ps[i]);
-      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-      p[i] = _mm_or_si128(ps[i], flat_p[i]);
-      qs[i] = _mm_andnot_si128(flat, qs[i]);
-      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-      q[i] = _mm_or_si128(qs[i], flat_q[i]);
-    }
-    // highbd_filter16
-    if (flat2_mask) {
-      for (i = 0; i < 6; i++) {
-        //  p[i] remains unchanged if !(flat2 && flat && mask)
-        p[i] = _mm_andnot_si128(flat2, p[i]);
-        flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-        //  get values for when (flat2 && flat && mask)
-        p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-        q[i] = _mm_andnot_si128(flat2, q[i]);
-        flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-        q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      }
-    }
-  } else {
-    p[0] = ps[0];
-    q[0] = qs[0];
-    p[1] = ps[1];
-    q[1] = qs[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_dual_sse2(
-    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p[7], q[7];
-  int i;
-  load_highbd_pixel(s, 7, pitch, p, q);
-
-  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
-                                   _limit1, _thresh1, bd);
-
-  for (i = 0; i < 6; i++) {
-    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
-    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
-    __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
-    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-  __m128i pq[3];
-  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
-  __m128i flat_p1p0, flat_q0q1;
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i t80;
-  const __m128i one = _mm_set1_epi16(0x1);
-
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
-  // flat_mask
-  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
-  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_c;
-    __m128i pq0x2_pq1, pq1_pq2;
-
-    // op1
-    pq0x2_pq1 =
-        _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]);  // p0 *2 + p1
-    pq1_pq2 = _mm_add_epi16(pq[1], pq[2]);                  // p1 + p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
-                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
-    workp_b =
-        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
-    flat_p1p0 = _mm_srli_epi16(workp_b, 3);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
-                            pq[1]);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_srli_si128(pq1_pq2, 8);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
-                            pq[0]);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(*q2, *q2);
-    workp_b =
-        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
-    flat_q0q1 = _mm_srli_epi16(workp_a, 3);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
-    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
-    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
-    const unsigned char *_thresh0, const unsigned char *_blimit1,
-    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit0, limit0, thresh0;
-  __m128i t80;
-  __m128i mask, flat, work;
-  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
-  __m128i op1, op0, oq0, oq1;
-  const __m128i four = _mm_set1_epi16(4);
-  const __m128i one = _mm_set1_epi16(0x1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  abs_p2p1 = abs_diff16(*p2, *p1);
-  abs_p1p0 = abs_diff16(*p1, *p0);
-  abs_q1q0 = abs_diff16(*q1, *q0);
-  abs_q2q1 = abs_diff16(*q2, *q1);
-
-  abs_p0q0 = abs_diff16(*p0, *q0);
-  abs_p1q1 = abs_diff16(*p1, *q1);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
-  mask = _mm_max_epi16(abs_q2q1, mask);
-  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  mask = _mm_max_epi16(work, mask);
-  mask = _mm_max_epi16(mask, abs_p2p1);
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  // flat_mask
-  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
-  flat = _mm_max_epi16(flat, work);
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-
-    // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
-                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
-    op1 = _mm_srli_epi16(workp_shft0, 3);
-
-    // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
-    workp_a =
-        _mm_add_epi16(workp_a,
-                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
-    op0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
-                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
-    workp_b = _mm_add_epi16(*q1, *q2);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
-    oq0 = _mm_srli_epi16(workp_shft0, 3);
-
-    // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
-                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
-    workp_b = _mm_add_epi16(*q2, *q2);
-    workp_shft1 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
-    oq1 = _mm_srli_epi16(workp_shft1, 3);
-
-    qs[0] = _mm_andnot_si128(flat, qs[0]);
-    oq0 = _mm_and_si128(flat, oq0);
-    *q0 = _mm_or_si128(qs[0], oq0);
-
-    qs[1] = _mm_andnot_si128(flat, qs[1]);
-    oq1 = _mm_and_si128(flat, oq1);
-    *q1 = _mm_or_si128(qs[1], oq1);
-
-    ps[0] = _mm_andnot_si128(flat, ps[0]);
-    op0 = _mm_and_si128(flat, op0);
-    *p0 = _mm_or_si128(ps[0], op0);
-
-    ps[1] = _mm_andnot_si128(flat, ps[1]);
-    op1 = _mm_and_si128(flat, op1);
-    *p1 = _mm_or_si128(ps[1], op1);
-  } else {
-    *q0 = qs[0];
-    *q1 = qs[1];
-    *p0 = ps[0];
-    *p1 = ps[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
-
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
-  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
-                             _blimit, _limit, _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
-}
-
-void aom_highbd_lpf_horizontal_6_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2;
-
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-
-  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
-                                  _limit0, _thresh0, _blimit1, _limit1,
-                                  _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    const unsigned char *_blimit, const unsigned char *_limit,
-    const unsigned char *_thresh, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-  __m128i pq[4];
-  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i work_a, opq2, flat_p1p0, flat_q0q1;
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
-
-  __m128i abs_p1p0;
-
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i t80;
-  const __m128i one = _mm_set1_epi16(0x1);
-
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
-  // flat_mask4
-  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
-  flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
-    // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
-    // o*p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    workp_c = _mm_add_epi16(workp_a, workp_c);
-
-    // o*p1
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
-    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
-    // o*p0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
-    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    workp_a = _mm_add_epi16(workp_a, workp_b);
-    opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-
-    work_a = _mm_andnot_si128(flat, pq[2]);
-    *p2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_a, *p2);
-    *q2 = _mm_srli_si128(*p2, 8);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
-    const unsigned char *_limit0, const unsigned char *_thresh0,
-    const unsigned char *_blimit1, const unsigned char *_limit1,
-    const unsigned char *_thresh1, int bd) {
-  __m128i blimit0, limit0, thresh0;
-  __m128i t80;
-  __m128i mask, flat;
-  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
-  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i four = _mm_set1_epi16(4);
-  const __m128i one = _mm_set1_epi16(0x1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  abs_p0q0 = abs_diff16(*p0, *q0);
-  abs_p1q1 = abs_diff16(*p1, *q1);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2  > blimit) * -1;
-
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
-  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
-  work1 =
-      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
-  work0 = _mm_max_epi16(work0, work1);
-  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
-  work2 = _mm_max_epi16(work2, work0);
-  mask = _mm_max_epi16(work2, mask);
-
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
-  flat = _mm_max_epi16(work1, flat);
-  work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
-  flat = _mm_max_epi16(work0, flat);
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b;
-    // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
-    // o*p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // o*p1
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
-    op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // o*p0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
-    op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
-    oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
-    oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    qs[0] = _mm_andnot_si128(flat, qs[0]);
-    oq0 = _mm_and_si128(flat, oq0);
-    *q0 = _mm_or_si128(qs[0], oq0);
-
-    qs[1] = _mm_andnot_si128(flat, qs[1]);
-    oq1 = _mm_and_si128(flat, oq1);
-    *q1 = _mm_or_si128(qs[1], oq1);
-
-    ps[0] = _mm_andnot_si128(flat, ps[0]);
-    op0 = _mm_and_si128(flat, op0);
-    *p0 = _mm_or_si128(ps[0], op0);
-
-    ps[1] = _mm_andnot_si128(flat, ps[1]);
-    op1 = _mm_and_si128(flat, op1);
-    *p1 = _mm_or_si128(ps[1], op1);
-
-    work_a = _mm_andnot_si128(flat, *q2);
-    *q2 = _mm_and_si128(flat, oq2);
-    *q2 = _mm_or_si128(work_a, *q2);
-
-    work_a = _mm_andnot_si128(flat, *p2);
-    *p2 = _mm_and_si128(flat, op2);
-    *p2 = _mm_or_si128(work_a, *p2);
-  } else {
-    *q0 = qs[0];
-    *q1 = qs[1];
-    *p0 = ps[0];
-    *p1 = ps[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-
-  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
-                             &p1p0, _blimit, _limit, _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-
-  highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
-                                  _blimit0, _limit0, _thresh0, _blimit1,
-                                  _limit1, _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
-    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
-    const uint8_t *_thresh, int bd) {
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev;
-  __m128i p1p0, q1q0;
-  __m128i pq[2];
-
-  __m128i abs_p1p0;
-
-  __m128i t80;
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-
-  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
-    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i blimit0, limit0, thresh0;
-  __m128i mask, flat;
-  __m128i p[2], q[2];
-
-  const __m128i zero = _mm_setzero_si128();
-  __m128i abs_p0q0 = abs_diff16(*q0, *p0);
-  __m128i abs_p1q1 = abs_diff16(*q1, *p1);
-
-  __m128i abs_p1p0 = abs_diff16(*p1, *p0);
-  __m128i abs_q1q0 = abs_diff16(*q1, *q0);
-
-  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
-  const __m128i one = _mm_set1_epi16(1);
-
-  __m128i t80;
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  // filter_mask and hev_mask
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-  mask = _mm_max_epi16(flat, mask);
-
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  p[0] = *p0;
-  p[1] = *p1;
-  q[0] = *q0;
-  q[1] = *q1;
-
-  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-}
-
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p1p0, q1q0;
-  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
-  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
-                             _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-void aom_highbd_lpf_horizontal_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i ps[2], qs[2];
-
-  highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
-                                  _thresh0, _blimit1, _limit1, _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
-}
-
-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
-  __m128i p1p0, q1q0;
-  __m128i p1, q1;
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
-  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
-
-  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
-                             thresh, bd);
-
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
-
-  // transpose from 8x4 to 4x8
-  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i ps[2], qs[2];
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
-
-  highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
-                               &d2, &d3);
-
-  highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
-                                  thresh0, blimit1, limit1, thresh1, bd);
-
-  highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
-                               &d3, &d4, &d5, &d6, &d7);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
-  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
-  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
-  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x3, x2, x1, x0, p0, q0;
-  __m128i p1p0, q1q0;
-
-  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
-  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-
-  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
-                             limit, thresh, bd);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_6_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i p0, q0, p1, q1, p2, q2;
-
-  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
-
-  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
-                           &p0, &q0, &q1, &q2, &d6, &d7);
-
-  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
-                                  _limit0, _thresh0, _blimit1, _limit1,
-                                  _thresh1, bd);
-
-  highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
-  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
-  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
-  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p2, p1, p0, p3, q0;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
-  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
-  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
-
-  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  // Loop filtering
-  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
-                             &p1p0, blimit, limit, thresh, bd);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
-                               &d1, &d2, &d3);
-
-  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
-  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
-  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
-  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
-  x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
-  x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
-  x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
-
-  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
-                           &d2, &d3, &d4, &d5, &d6, &d7);
-
-  highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
-                                  blimit0, limit0, thresh0, blimit1, limit1,
-                                  thresh1, bd);
-
-  highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
-                           &x2, &x3, &x4, &x5, &x6, &x7);
-
-  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
-  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
-  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
-  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
-  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
-  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
-  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
-  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
-}
-
-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
-                                     const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int bd) {
-  __m128i q[7], p[7], pq[7];
-  __m128i p6, p5, p4, p3;
-  __m128i p6_2, p5_2, p4_2, p3_2;
-  __m128i d0, d1, d2, d3;
-  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
-
-  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
-  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
-  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
-  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-
-  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
-                               &p[3], &p[2], &p[1], &p[0]);
-
-  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
-  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
-  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
-  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-
-  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
-                               &q[3], &q[4], &q[5], &q[6], &d7_2);
-
-  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
-  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
-                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
-
-  q[0] = _mm_srli_si128(pq[0], 8);
-  q[1] = _mm_srli_si128(pq[1], 8);
-  q[2] = _mm_srli_si128(pq[2], 8);
-  q[3] = _mm_srli_si128(pq[3], 8);
-  q[4] = _mm_srli_si128(pq[4], 8);
-  q[5] = _mm_srli_si128(pq[5], 8);
-
-  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
-                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
-  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
-  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
-  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
-  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
-}
-
-void aom_highbd_lpf_vertical_14_dual_sse2(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i q[7], p[7];
-  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
-  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
-  __m128i d0, d7;
-  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
-
-  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
-  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
-  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
-  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
-  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
-  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
-  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
-
-  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
-                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
-
-  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
-  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
-  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
-  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
-  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
-  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
-  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
-
-  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
-                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
-                           &q[6], &d7);
-
-  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
-                                   limit1, thresh1, bd);
-
-  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
-                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
-                           &d6_out, &d7_out);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
-
-  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
-                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
-                           &d6_out, &d7_out);
-
-  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
-  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
-  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
-  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
-  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
-  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
-  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
-  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
deleted file mode 100644
index b9689202a..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
-  const __m128i sign = _mm_srai_epi16(*p, 15);
-  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
-  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
-  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
-}
-
-static INLINE void update_qp(__m256i *qp) {
-  int i;
-  for (i = 0; i < 5; ++i) {
-    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
-  }
-}
-
-static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
-                           const int16_t *quant_ptr, const int16_t *dequant_ptr,
-                           const int16_t *quant_shift_ptr, __m256i *qp) {
-  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
-  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
-  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
-  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
-  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
-  init_one_qp(&zbin, &qp[0]);
-  init_one_qp(&round, &qp[1]);
-  init_one_qp(&quant, &qp[2]);
-  init_one_qp(&dequant, &qp[3]);
-  init_one_qp(&quant_shift, &qp[4]);
-}
-
-// Note:
-// *x is vector multiplied by *y which is 16 int32_t parallel multiplication
-// and right shift 16.  The output, 16 int32_t is save in *p.
-static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y,
-                                         __m256i *p) {
-  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
-  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
-  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
-  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
-
-  prod_lo = _mm256_srli_epi64(prod_lo, 16);
-  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
-  prod_lo = _mm256_and_si256(prod_lo, mask);
-  prod_hi = _mm256_srli_epi64(prod_hi, 16);
-
-  prod_hi = _mm256_slli_epi64(prod_hi, 32);
-  *p = _mm256_or_si256(prod_lo, prod_hi);
-}
-
-static INLINE void quantize(const __m256i *qp, __m256i *c,
-                            const int16_t *iscan_ptr, tran_low_t *qcoeff,
-                            tran_low_t *dqcoeff, __m256i *eob) {
-  const __m256i abs = _mm256_abs_epi32(*c);
-  const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]);
-  __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]);
-  flag2 = _mm256_or_si256(flag1, flag2);
-  const int32_t nzflag = _mm256_movemask_epi8(flag2);
-
-  if (LIKELY(nzflag)) {
-    __m256i q = _mm256_add_epi32(abs, qp[1]);
-    __m256i tmp;
-    mm256_mul_shift_epi32(&q, &qp[2], &tmp);
-    q = _mm256_add_epi32(tmp, q);
-
-    mm256_mul_shift_epi32(&q, &qp[4], &q);
-    __m256i dq = _mm256_mullo_epi32(q, qp[3]);
-
-    q = _mm256_sign_epi32(q, *c);
-    dq = _mm256_sign_epi32(dq, *c);
-    q = _mm256_and_si256(q, flag2);
-    dq = _mm256_and_si256(dq, flag2);
-
-    _mm256_storeu_si256((__m256i *)qcoeff, q);
-    _mm256_storeu_si256((__m256i *)dqcoeff, dq);
-
-    const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
-    const __m128i zr = _mm_setzero_si128();
-    const __m128i lo = _mm_unpacklo_epi16(isc, zr);
-    const __m128i hi = _mm_unpackhi_epi16(isc, zr);
-    const __m256i iscan =
-        _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
-    const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
-    __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
-    cur_eob = _mm256_and_si256(cur_eob, nz);
-    *eob = _mm256_max_epi32(cur_eob, *eob);
-  } else {
-    const __m256i zero = _mm256_setzero_si256();
-    _mm256_storeu_si256((__m256i *)qcoeff, zero);
-    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
-  }
-}
-
-void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  (void)scan;
-  const unsigned int step = 8;
-
-  __m256i qp[5], coeff;
-  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
-  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-
-  __m256i eob = _mm256_setzero_si256();
-  quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-  coeff_ptr += step;
-  qcoeff_ptr += step;
-  dqcoeff_ptr += step;
-  iscan += step;
-  n_coeffs -= step;
-
-  update_qp(qp);
-
-  while (n_coeffs > 0) {
-    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-    quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-    coeff_ptr += step;
-    qcoeff_ptr += step;
-    dqcoeff_ptr += step;
-    iscan += step;
-    n_coeffs -= step;
-  }
-  {
-    __m256i eob_s;
-    eob_s = _mm256_shuffle_epi32(eob, 0xe);
-    eob = _mm256_max_epi16(eob, eob_s);
-    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-    eob = _mm256_max_epi16(eob, eob_s);
-    eob_s = _mm256_shufflelo_epi16(eob, 1);
-    eob = _mm256_max_epi16(eob, eob_s);
-    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                            _mm256_extractf128_si256(eob, 1));
-    *eob_ptr = _mm_extract_epi16(final_eob, 0);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
deleted file mode 100644
index 58e5f98e5..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-
-void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
-  __m128i zbins[2];
-  __m128i nzbins[2];
-
-  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
-                           (int)zbin_ptr[0]);
-  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
-
-  nzbins[0] = _mm_setzero_si128();
-  nzbins[1] = _mm_setzero_si128();
-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
-  (void)scan;
-
-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = ((int)count / 4) - 1; i >= 0; i--) {
-    __m128i coeffs, cmp1, cmp2;
-    int test;
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-    cmp1 = _mm_and_si128(cmp1, cmp2);
-    test = _mm_movemask_epi8(cmp1);
-    if (test == 0xffff)
-      non_zero_regs--;
-    else
-      break;
-  }
-
-  // Quantization pass:
-  for (i = 0; i < non_zero_regs; i++) {
-    __m128i coeffs, coeffs_sign, tmp1, tmp2;
-    int test;
-    int abs_coeff[4];
-    int coeff_sign[4];
-
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    coeffs_sign = _mm_srai_epi32(coeffs, 31);
-    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
-    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
-    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
-    tmp1 = _mm_or_si128(tmp1, tmp2);
-    test = _mm_movemask_epi8(tmp1);
-    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
-    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
-    for (j = 0; j < 4; j++) {
-      if (test & (1 << (4 * j))) {
-        int k = 4 * i + j;
-        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
-        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
-        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
-      }
-    }
-  }
-  *eob_ptr = eob_i + 1;
-}
-
-void aom_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  __m128i zbins[2];
-  __m128i nzbins[2];
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
-  (void)scan;
-  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
-  zbins[1] = _mm_set1_epi32(zbin1_tmp);
-
-  nzbins[0] = _mm_setzero_si128();
-  nzbins[1] = _mm_setzero_si128();
-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = 0; i < n_coeffs / 4; i++) {
-    __m128i coeffs, cmp1, cmp2;
-    int test;
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-    cmp1 = _mm_and_si128(cmp1, cmp2);
-    test = _mm_movemask_epi8(cmp1);
-    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
-    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
-    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
-    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
-  }
-
-  // Quantization pass: only process the coefficients selected in
-  // pre-scan pass. Note: idx can be zero.
-  for (i = 0; i < idx; i++) {
-    const int rc = idx_arr[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-    const uint32_t abs_qcoeff =
-        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
-  }
-  *eob_ptr = eob + 1;
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
deleted file mode 100644
index e0d22522d..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ /dev/null
@@ -1,296 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_4x2x4 5-6 0
-  movh                  m0, [srcq +%2*2]
-%if %1 == 1
-  movu                  m4, [ref1q+%3*2]
-  movu                  m5, [ref2q+%3*2]
-  movu                  m6, [ref3q+%3*2]
-  movu                  m7, [ref4q+%3*2]
-  movhps                m0, [srcq +%4*2]
-  movhps                m4, [ref1q+%5*2]
-  movhps                m5, [ref2q+%5*2]
-  movhps                m6, [ref3q+%5*2]
-  movhps                m7, [ref4q+%5*2]
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m4
-  psubusw               m2, m5
-  psubusw               m4, m0
-  psubusw               m5, m0
-  por                   m4, m3
-  por                   m5, m2
-  pmaddwd               m4, m1
-  pmaddwd               m5, m1
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m6
-  psubusw               m2, m7
-  psubusw               m6, m0
-  psubusw               m7, m0
-  por                   m6, m3
-  por                   m7, m2
-  pmaddwd               m6, m1
-  pmaddwd               m7, m1
-%else
-  movu                  m2, [ref1q+%3*2]
-  movhps                m0, [srcq +%4*2]
-  movhps                m2, [ref1q+%5*2]
-  mova                  m3, m0
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m4, m2
-
-  movu                  m2, [ref2q+%3*2]
-  mova                  m3, m0
-  movhps                m2, [ref2q+%5*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m5, m2
-
-  movu                  m2, [ref3q+%3*2]
-  mova                  m3, m0
-  movhps                m2, [ref3q+%5*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m6, m2
-
-  movu                  m2, [ref4q+%3*2]
-  mova                  m3, m0
-  movhps                m2, [ref4q+%5*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m7, m2
-%endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*4]
-  lea                ref1q, [ref1q+ref_strideq*4]
-  lea                ref2q, [ref2q+ref_strideq*4]
-  lea                ref3q, [ref3q+ref_strideq*4]
-  lea                ref4q, [ref4q+ref_strideq*4]
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_8x2x4 5-6 0
-  ; 1st 8 px
-  mova                  m0, [srcq +%2*2]
-%if %1 == 1
-  movu                  m4, [ref1q+%3*2]
-  movu                  m5, [ref2q+%3*2]
-  movu                  m6, [ref3q+%3*2]
-  movu                  m7, [ref4q+%3*2]
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m4
-  psubusw               m2, m5
-  psubusw               m4, m0
-  psubusw               m5, m0
-  por                   m4, m3
-  por                   m5, m2
-  pmaddwd               m4, m1
-  pmaddwd               m5, m1
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m6
-  psubusw               m2, m7
-  psubusw               m6, m0
-  psubusw               m7, m0
-  por                   m6, m3
-  por                   m7, m2
-  pmaddwd               m6, m1
-  pmaddwd               m7, m1
-%else
-  mova                  m3, m0
-  movu                  m2, [ref1q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m4, m2
-  movu                  m2, [ref2q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m5, m2
-  movu                  m2, [ref3q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m6, m2
-  movu                  m2, [ref4q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m7, m2
-%endif
-
-  ; 2nd 8 px
-  mova                  m0, [srcq +(%4)*2]
-  mova                  m3, m0
-  movu                  m2, [ref1q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m4, m2
-  movu                  m2, [ref2q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m5, m2
-  movu                  m2, [ref3q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m6, m2
-  movu                  m2, [ref4q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*4]
-  lea                ref1q, [ref1q+ref_strideq*4]
-  lea                ref2q, [ref2q+ref_strideq*4]
-  lea                ref3q, [ref3q+ref_strideq*4]
-  lea                ref4q, [ref4q+ref_strideq*4]
-%endif
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m7, m2
-%endmacro
-
-; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_16x2x4 5-6 0
-  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
-  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
-%endmacro
-
-; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_32x2x4 5-6 0
-  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
-  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
-%endmacro
-
-; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_64x2x4 5-6 0
-  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
-  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
-%endmacro
-
-; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
-;                         uint8_t *ref[4], int ref_stride,
-;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
-%if UNIX64
-cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
-                              res, ref2, ref3, ref4
-%else
-cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
-                              ref2, ref3, ref4
-%endif
-
-; set m1
-  push                srcq
-  mov                 srcd, 0x00010001
-  movd                  m1, srcd
-  pshufd                m1, m1, 0x0
-  pop                 srcq
-
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  mov                ref2q, [ref1q+gprsize*1]
-  mov                ref3q, [ref1q+gprsize*2]
-  mov                ref4q, [ref1q+gprsize*3]
-  mov                ref1q, [ref1q+gprsize*0]
-
-; convert byte pointers to short pointers
-  shl                 srcq, 1
-  shl                ref2q, 1
-  shl                ref3q, 1
-  shl                ref4q, 1
-  shl                ref1q, 1
-
-  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
-  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
-  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
-  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
-  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
-  movhlps               m0, m4
-  movhlps               m1, m5
-  movhlps               m2, m6
-  movhlps               m3, m7
-  paddd                 m4, m0
-  paddd                 m5, m1
-  paddd                 m6, m2
-  paddd                 m7, m3
-  punpckldq             m4, m5
-  punpckldq             m6, m7
-  movhlps               m0, m4
-  movhlps               m1, m6
-  paddd                 m4, m0
-  paddd                 m6, m1
-  punpcklqdq            m4, m6
-  movifnidn             r4, r4mp
-  movu                [r4], m4
-  RET
-%endmacro
-
-
-INIT_XMM sse2
-HIGH_SADNXN4D 64, 64
-HIGH_SADNXN4D 64, 32
-HIGH_SADNXN4D 32, 64
-HIGH_SADNXN4D 32, 32
-HIGH_SADNXN4D 32, 16
-HIGH_SADNXN4D 16, 32
-HIGH_SADNXN4D 16, 16
-HIGH_SADNXN4D 16,  8
-HIGH_SADNXN4D  8, 16
-HIGH_SADNXN4D  8,  8
-HIGH_SADNXN4D  8,  4
-HIGH_SADNXN4D  4,  8
-HIGH_SADNXN4D  4,  4
-HIGH_SADNXN4D  4, 16
-HIGH_SADNXN4D 16,  4
-HIGH_SADNXN4D  8, 32
-HIGH_SADNXN4D 32,  8
-HIGH_SADNXN4D 16, 64
-HIGH_SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
deleted file mode 100644
index 3398d8a2a..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ /dev/null
@@ -1,374 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro HIGH_SAD_FN 4
-%if %4 == 0
-%if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
-                            src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-%else ; avg
-%if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
-                                    second_pred, n_rows
-%else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
-                                              ref, ref_stride, \
-                                              second_pred, \
-                                              src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; avg/sad
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-; convert src, ref & second_pred to short ptrs (from byte ptrs)
-  shl                 srcq, 1
-  shl                 refq, 1
-%if %4 == 1
-  shl         second_predq, 1
-%endif
-%endmacro
-
-; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD64XN 1-2 0
-  HIGH_SAD_FN 64, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  ; first half of each row
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+16]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+16]
-  por                   m2, m5
-  mova                  m5, [srcq+32]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+32]
-  por                   m3, m5
-  mova                  m5, [srcq+48]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+48]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  paddd                 m0, m1
-  paddd                 m0, m3
-  ; second half of each row
-  movu                  m1, [refq+64]
-  movu                  m2, [refq+80]
-  movu                  m3, [refq+96]
-  movu                  m4, [refq+112]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq+64]
-  psubusw               m5, m1
-  psubusw               m1, [srcq+64]
-  por                   m1, m5
-  mova                  m5, [srcq+80]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+80]
-  por                   m2, m5
-  mova                  m5, [srcq+96]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+96]
-  por                   m3, m5
-  mova                  m5, [srcq+112]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+112]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
-HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
-HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
-HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
-HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
-
-; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD32XN 1-2 0
-  HIGH_SAD_FN 32, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+16]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+16]
-  por                   m2, m5
-  mova                  m5, [srcq+32]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+32]
-  por                   m3, m5
-  mova                  m5, [srcq+48]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+48]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
-HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
-HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
-HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
-HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
-HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
-HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
-
-; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD16XN 1-2 0
-  HIGH_SAD_FN 16, %1, 5, %2
-  mov              n_rowsd, %1/2
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+ref_strideq*2]
-  movu                  m4, [refq+ref_strideq*2+16]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+16]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*2+16]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+16]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+16]
-  por                   m2, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*2]
-  por                   m3, m5
-  mova                  m5, [srcq+src_strideq*2+16]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_strideq*2+16]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
-HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
-HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
-HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
-HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
-HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
-HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
-HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
-HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
-
-; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD8XN 1-2 0
-  HIGH_SAD_FN 8, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+ref_strideq*2]
-  movu                  m3, [refq+ref_strideq*4]
-  movu                  m4, [refq+ref_stride3q*2]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+src_strideq*2]
-  por                   m2, m5
-  mova                  m5, [srcq+src_strideq*4]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*4]
-  por                   m3, m5
-  mova                  m5, [srcq+src_stride3q*2]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_stride3q*2]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*8]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*8]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
-HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
-HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
-HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
-HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
-HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
-HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
deleted file mode 100644
index 61f5b8e86..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ /dev/null
@@ -1,1036 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times  8 dw  8
-bilin_filter_m_sse2: times  8 dw 16
-                     times  8 dw  0
-                     times  8 dw 14
-                     times  8 dw  2
-                     times  8 dw 12
-                     times  8 dw  4
-                     times  8 dw 10
-                     times  8 dw  6
-                     times 16 dw  8
-                     times  8 dw  6
-                     times  8 dw 10
-                     times  8 dw  4
-                     times  8 dw 12
-                     times  8 dw  2
-                     times  8 dw 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
-;                               int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
-  psubw                %3, %4
-  psubw                %1, %2
-  mova                 %4, %3       ; make copies to manipulate to calc sum
-  mova                 %2, %1       ; use originals for calc sse
-  pmaddwd              %3, %3
-  paddw                %4, %2
-  pmaddwd              %1, %1
-  movhlps              %2, %4
-  paddd                %6, %3
-  paddw                %4, %2
-  pxor                 %2, %2
-  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
-  punpcklwd            %4, %2       ; sign-extend word to dword
-  paddd                %6, %1
-  paddd                %5, %4
-
-%endmacro
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
-  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
-  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
-  ; We have to sign-extend it before adding the words within the register
-  ; and outputing to a dword.
-  movhlps              m3, m7
-  movhlps              m4, m6
-  paddd                m7, m3
-  paddd                m6, m4
-  pshufd               m3, m7, 0x1
-  pshufd               m4, m6, 0x1
-  paddd                m7, m3
-  paddd                m6, m4
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  movd               [r1], m7           ; store sse
-  movd                eax, m6           ; store sum as return value
-%endif
-  RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  add                srcq, src_stridemp
-  add                srcq, src_stridemp
-%else
-  lea                srcq, [srcq + src_strideq*2]
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-
-
-%if ARCH_X86_64
-  %if %2 == 1 ; avg
-    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                      x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
-    %define sec_str sec_strideq
-  %else
-    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, height, sse
-  %endif
-  %define block_height heightd
-  %define bilin_filter sseq
-%else
-  %if CONFIG_PIC=1
-    %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                        x_offset, y_offset, \
-                                        dst, dst_stride, \
-                                        sec, sec_stride, height, sse, \
-                                        g_bilin_filter, g_pw_8
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-
-      ; Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %else
-      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                    x_offset, y_offset, \
-                                    dst, dst_stride, height, sse, \
-                                    g_bilin_filter, g_pw_8
-      %define block_height heightd
-
-      ; Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %endif
-  %else
-    %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                        x_offset, y_offset, \
-                                        dst, dst_stride, \
-                                        sec, sec_stride, height, sse
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-    %else
-      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                    x_offset, y_offset, \
-                                    dst, dst_stride, height, sse
-      %define block_height heightd
-    %endif
-
-    %define bilin_filter bilin_filter_m
-  %endif
-%endif
-
-  ASSERT               %1 <= 16         ; m6 overflows if w > 16
-  pxor                 m6, m6           ; sum
-  pxor                 m7, m7           ; sse
-
-%if %1 < 16
-  sar                   block_height, 1
-%endif
-%if %2 == 1 ; avg
-  shl             sec_str, 1
-%endif
-
-  ; FIXME(rbultje) replace by jumptable?
-  test          x_offsetd, x_offsetd
-  jnz .x_nonzero
-  ; x_offset == 0
-  test          y_offsetd, y_offsetd
-  jnz .x_zero_y_nonzero
-
-  ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq + 16]
-  mova                 m1, [dstq]
-  mova                 m3, [dstq + 16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m2, [secq+16]
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq + src_strideq*2]
-  mova                 m1, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_zero_y_zero_loop
-  STORE_AND_RET
-
-.x_zero_y_nonzero:
-  cmp           y_offsetd, 8
-  jne .x_zero_y_nonhalf
-
-  ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m4, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+16]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+16]
-  pavgw                m0, m4
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*4]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
-  pavgw                m0, m1
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_zero_y_half_loop
-  STORE_AND_RET
-
-.x_zero_y_nonhalf:
-  ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+y_offsetq]
-  mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq + 16]
-  movu                 m4, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+16]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+16]
-  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
-  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
-  ; slightly faster because of pmullw latency. It would also cut our rodata
-  ; tables in half for this function, and save 1-2 registers on x86-64.
-  pmullw               m1, filter_y_a
-  pmullw               m5, filter_y_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m1, m5
-  paddw                m0, m4
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*4]
-  mova                 m4, m1
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
-  pmullw               m1, filter_y_a
-  pmullw               m5, filter_y_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m1, m5
-  paddw                m0, m4
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonzero:
-  cmp           x_offsetd, 8
-  jne .x_nonhalf
-  ; x_offset == 0.5
-  test          y_offsetd, y_offsetd
-  jnz .x_half_y_nonzero
-
-  ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq + 16]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + 18]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + 16]
-  pavgw                m0, m4
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq + src_strideq*2]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + src_strideq*2 + 2]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
-  pavgw                m0, m4
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_half_y_zero_loop
-  STORE_AND_RET
-
-.x_half_y_nonzero:
-  cmp           y_offsetd, 8
-  jne .x_half_y_nonhalf
-
-  ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-  pavgw                m1, m3
-.x_half_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq + 16]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + 18]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  pavgw                m0, m2
-  pavgw                m1, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + 16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-  mova                 m0, m2
-  mova                 m1, m3
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-.x_half_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq + src_strideq*2]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + src_strideq*2 + 2]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  pavgw                m0, m2
-  pavgw                m2, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
-%endif
-  SUM_SSE              m0, m4, m2, m5, m6, m7
-  mova                 m0, m3
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_half_y_half_loop
-  STORE_AND_RET
-
-.x_half_y_nonhalf:
-  ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+y_offsetq]
-  mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else  ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-  pavgw                m1, m3
-.x_half_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+16]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+18]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m1, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m1, filter_rnd
-  paddw                m1, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  psrlw                m1, 4
-  paddw                m0, m2
-  mova                 m2, [dstq]
-  psrlw                m0, 4
-  mova                 m3, [dstq+16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-  mova                 m0, m4
-  mova                 m1, m5
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-.x_half_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+src_strideq*2+2]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m4, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m4, filter_rnd
-  paddw                m4, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  psrlw                m4, 4
-  paddw                m0, m2
-  mova                 m2, [dstq]
-  psrlw                m0, 4
-  mova                 m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m4, [secq]
-%endif
-  SUM_SSE              m0, m2, m4, m3, m6, m7
-  mova                 m0, m5
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonhalf:
-  test          y_offsetd, y_offsetd
-  jnz .x_nonhalf_y_nonzero
-
-  ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+x_offsetq]
-  mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+16]
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m1, m3
-  paddw                m0, m2
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+src_strideq*2]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+src_strideq*2+2]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m1, m3
-  paddw                m0, m2
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-
-  lea                srcq, [srcq+src_strideq*4]
-  lea                dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 8
-  jne .x_nonhalf_y_nonhalf
-
-  ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+x_offsetq]
-  mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m0, m2
-  paddw                m1, m3
-  psrlw                m0, 4
-  psrlw                m1, 4
-  lea                srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+16]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+18]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+16]
-  psrlw                m2, 4
-  psrlw                m3, 4
-  pavgw                m0, m2
-  pavgw                m1, m3
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-  mova                 m0, m2
-  mova                 m1, m3
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m2
-  psrlw                m0, 4
-  lea                srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+src_strideq*2+2]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
-  psrlw                m2, 4
-  psrlw                m3, 4
-  pavgw                m0, m2
-  pavgw                m2, m3
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
-%endif
-  SUM_SSE              m0, m4, m2, m5, m6, m7
-  mova                 m0, m3
-
-  lea                srcq, [srcq+src_strideq*4]
-  lea                dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-; loading filter - this is same as in 8-bit depth
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+x_offsetq]
-  mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [bilin_filter+y_offsetq]
-  mova                m11, [bilin_filter+y_offsetq+16]
-  mova                m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register. Used src_stride register. Later,
-; src_stride has to be loaded from stack when it is needed.
-%define tempq src_strideq
-  mov tempq, g_bilin_filterm
-  add           x_offsetq, tempq
-  add           y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-  add           y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-; end of load filter
-
-  ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  movu                 m1, [srcq+16]
-  movu                 m3, [srcq+18]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m0, m2
-  paddw                m1, m3
-  psrlw                m0, 4
-  psrlw                m1, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m4, [srcq+2]
-  movu                 m3, [srcq+16]
-  movu                 m5, [srcq+18]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  psrlw                m2, 4
-  psrlw                m3, 4
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m1, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, m2
-  paddw                m1, filter_rnd
-  mova                 m2, [dstq]
-  paddw                m1, m3
-  psrlw                m0, 4
-  psrlw                m1, 4
-  mova                 m3, [dstq+16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-  mova                 m0, m4
-  mova                 m1, m5
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq + dst_strideq * 2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m2
-  psrlw                m0, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m4, [srcq+2]
-  INC_SRC_BY_SRC_STRIDE
-  movu                 m3, [srcq]
-  movu                 m5, [srcq+2]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  psrlw                m2, 4
-  psrlw                m3, 4
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m4, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, m2
-  paddw                m4, filter_rnd
-  mova                 m2, [dstq]
-  paddw                m4, m3
-  psrlw                m0, 4
-  psrlw                m4, 4
-  mova                 m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m4, [secq]
-%endif
-  SUM_SSE              m0, m2, m4, m3, m6, m7
-  mova                 m0, m5
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq + dst_strideq * 4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET
-%endmacro
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
deleted file mode 100644
index 18eb03d12..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <stddef.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
-                                    const uint16_t *src, ptrdiff_t src_stride,
-                                    const uint16_t *pred,
-                                    ptrdiff_t pred_stride);
-
-static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3;
-  __m128i v0, v1, v2, v3;
-  __m128i x0, x1, x2, x3;
-  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-
-  _mm_storel_epi64((__m128i *)store_diff, x0);
-  store_diff = (int64_t *)(diff + 1 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x1);
-  store_diff = (int64_t *)(diff + 2 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x2);
-  store_diff = (int64_t *)(diff + 3 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x3);
-}
-
-static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-  v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
-  v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
-  v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
-  v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-  x4 = _mm_sub_epi16(u4, v4);
-  x5 = _mm_sub_epi16(u5, v5);
-  x6 = _mm_sub_epi16(u6, v6);
-  x7 = _mm_sub_epi16(u7, v7);
-
-  _mm_storel_epi64((__m128i *)store_diff, x0);
-  store_diff = (int64_t *)(diff + 1 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x1);
-  store_diff = (int64_t *)(diff + 2 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x2);
-  store_diff = (int64_t *)(diff + 3 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x3);
-  store_diff = (int64_t *)(diff + 4 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x4);
-  store_diff = (int64_t *)(diff + 5 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x5);
-  store_diff = (int64_t *)(diff + 6 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x6);
-  store_diff = (int64_t *)(diff + 7 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x7);
-}
-
-static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3;
-  __m128i v0, v1, v2, v3;
-  __m128i x0, x1, x2, x3;
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-
-  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
-  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
-  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
-  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
-}
-
-static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-  v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
-  v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
-  v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
-  v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-  x4 = _mm_sub_epi16(u4, v4);
-  x5 = _mm_sub_epi16(u5, v5);
-  x6 = _mm_sub_epi16(u6, v6);
-  x7 = _mm_sub_epi16(u7, v7);
-
-  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
-  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
-  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
-  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
-  _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
-  _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
-  _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
-  _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
-}
-
-#define STACK_V(h, fun)                                                        \
-  do {                                                                         \
-    fun(diff, diff_stride, src, src_stride, pred, pred_stride);                \
-    fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
-        pred + pred_stride * h, pred_stride);                                  \
-  } while (0)
-
-#define STACK_H(w, fun)                                                     \
-  do {                                                                      \
-    fun(diff, diff_stride, src, src_stride, pred, pred_stride);             \
-    fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
-  } while (0)
-
-#define SUBTRACT_FUN(size)                                               \
-  static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride,      \
-                              const uint16_t *src, ptrdiff_t src_stride, \
-                              const uint16_t *pred, ptrdiff_t pred_stride)
-
-SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
-SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
-SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
-SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
-SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
-SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
-SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
-SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
-SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
-SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
-SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
-SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
-SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
-SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
-SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
-SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
-SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
-SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
-
-static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
-  if (rows == 4) {
-    if (cols == 4) return subtract_4x4;
-    if (cols == 8) return subtract_8x4;
-    if (cols == 16) return subtract_16x4;
-  }
-  if (rows == 8) {
-    if (cols == 4) return subtract_4x8;
-    if (cols == 8) return subtract_8x8;
-    if (cols == 16) return subtract_16x8;
-    if (cols == 32) return subtract_32x8;
-  }
-  if (rows == 16) {
-    if (cols == 4) return subtract_4x16;
-    if (cols == 8) return subtract_8x16;
-    if (cols == 16) return subtract_16x16;
-    if (cols == 32) return subtract_32x16;
-    if (cols == 64) return subtract_64x16;
-  }
-  if (rows == 32) {
-    if (cols == 8) return subtract_8x32;
-    if (cols == 16) return subtract_16x32;
-    if (cols == 32) return subtract_32x32;
-    if (cols == 64) return subtract_64x32;
-  }
-  if (rows == 64) {
-    if (cols == 16) return subtract_16x64;
-    if (cols == 32) return subtract_32x64;
-    if (cols == 64) return subtract_64x64;
-    if (cols == 128) return subtract_128x64;
-  }
-  if (rows == 128) {
-    if (cols == 64) return subtract_64x128;
-    if (cols == 128) return subtract_128x128;
-  }
-  assert(0);
-  return NULL;
-}
-
-void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
-                                    ptrdiff_t diff_stride, const uint8_t *src8,
-                                    ptrdiff_t src_stride, const uint8_t *pred8,
-                                    ptrdiff_t pred_stride, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  SubtractWxHFuncType func;
-  (void)bd;
-
-  func = getSubtractFunc(rows, cols);
-  func(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
deleted file mode 100644
index 9b1b4c9de..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>  // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
-                                   const uint16_t *ref, int ref_stride,
-                                   uint32_t *sse, int *sum);
-
-void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
-                                const uint16_t *ref, int ref_stride,
-                                uint32_t *sse, int *sum) {
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-  for (int i = 0; i < 8; i += 2) {
-    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
-    const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
-    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
-    const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
-    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
-    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
-    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
-    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
-    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
-    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
-    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
-    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
-    src += src_stride * 2;
-    ref += ref_stride * 2;
-  }
-  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
-  __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
-  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
-  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
-  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
-  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
-  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
-  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
-  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
-  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
-  *sum = _mm_extract_epi32(v_d, 0);
-  *sse = _mm_extract_epi32(v_d, 1);
-}
-
-void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
-                                  const uint16_t *ref, int ref_stride,
-                                  uint32_t *sse, int *sum) {
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  for (int i = 0; i < 16; ++i) {
-    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
-    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
-    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
-    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
-    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
-    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
-    src += src_stride;
-    ref += ref_stride;
-  }
-  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
-  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
-  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
-  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
-  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
-  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
-  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
-  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
-  *sum = _mm_extract_epi32(v_d, 0);
-  *sse = _mm_extract_epi32(v_d, 1);
-}
-
-static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int32_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-#define VAR_FN(w, h, block_size, shift)                                    \
-  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_10_variance_avx2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
-
-#undef VAR_FN
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
deleted file mode 100644
index 0d954e178..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ /dev/null
@@ -1,318 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-;unsigned int aom_highbd_calc16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(aom_highbd_calc16x16var_sse2) PRIVATE
-sym(aom_highbd_calc16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-        add         rax,            rax ; source stride in bytes
-        add         rdx,            rdx ; recon stride in bytes
-
-        ; Prefetch data
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+16]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax+16]
-        lea             rbx,    [rsi+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax+16]
-
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+16]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx+16]
-        lea             rbx,    [rdi+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx+16]
-
-        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        lea             rbx,    [rsi+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax+16]
-        lea             rbx,    [rdi+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx+16]
-
-        pxor        xmm5,           xmm5
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+16]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+16]
-        paddd       xmm6,           xmm1
-
-        psubw       xmm3,           xmm2
-        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
-        paddd       xmm6,           xmm3
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
-        paddd       xmm6,           xmm1
-
-        psubw       xmm3,           xmm2
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        paddd       xmm6,           xmm3
-
-        movdqa      xmm1,           xmm5
-        movdqa      xmm2,           xmm5
-        pcmpgtw     xmm1,           xmm0
-        pcmpeqw     xmm2,           xmm0
-        por         xmm1,           xmm2
-        pcmpeqw     xmm1,           xmm0
-        movdqa      xmm2,           xmm5
-        punpcklwd   xmm5,           xmm1
-        punpckhwd   xmm2,           xmm1
-        paddd       xmm7,           xmm5
-        paddd       xmm7,           xmm2
-
-        lea         rsi,            [rsi + 2*rax]
-        lea         rdi,            [rdi + 2*rdx]
-        sub         rcx,            2
-        jnz         .var16loop
-
-        movdqa      xmm4,           xmm6
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm4,           xmm0
-        movdqa      xmm5,           xmm7
-
-        paddd       xmm6,           xmm4
-        punpckldq   xmm7,           xmm0
-
-        punpckhdq   xmm5,           xmm0
-        paddd       xmm7,           xmm5
-
-        movdqa      xmm4,           xmm6
-        movdqa      xmm5,           xmm7
-
-        psrldq      xmm4,           8
-        psrldq      xmm5,           8
-
-        paddd       xmm6,           xmm4
-        paddd       xmm7,           xmm5
-
-        mov         rdi,            arg(4)   ; [SSE]
-        mov         rax,            arg(5)   ; [Sum]
-
-        movd DWORD PTR [rdi],       xmm6
-        movd DWORD PTR [rax],       xmm7
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int aom_highbd_calc8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(aom_highbd_calc8x8var_sse2) PRIVATE
-sym(aom_highbd_calc8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-        add         rax,            rax ; source stride in bytes
-        add         rdx,            rdx ; recon stride in bytes
-
-        ; Prefetch data
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        lea             rbx,    [rsi+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        lea             rbx,    [rdi+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-
-        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
-        mov         rcx,            8
-
-.var8loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        lea             rbx,    [rbx+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        lea             rbx,    [rbx+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-
-        pxor        xmm5,           xmm5
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
-        paddd       xmm6,           xmm1
-
-        lea         rsi,            [rsi + 2*rax]
-        lea         rdi,            [rdi + 2*rdx]
-
-        psubw       xmm3,           xmm2
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-        paddd       xmm6,           xmm3
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
-        paddd       xmm6,           xmm1
-
-        psubw       xmm3,           xmm2
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        paddd       xmm6,           xmm3
-
-        movdqa      xmm1,           xmm5
-        movdqa      xmm2,           xmm5
-        pcmpgtw     xmm1,           xmm0
-        pcmpeqw     xmm2,           xmm0
-        por         xmm1,           xmm2
-        pcmpeqw     xmm1,           xmm0
-        movdqa      xmm2,           xmm5
-        punpcklwd   xmm5,           xmm1
-        punpckhwd   xmm2,           xmm1
-        paddd       xmm7,           xmm5
-        paddd       xmm7,           xmm2
-
-        lea         rsi,            [rsi + 2*rax]
-        lea         rdi,            [rdi + 2*rdx]
-        sub         rcx,            4
-        jnz         .var8loop
-
-        movdqa      xmm4,           xmm6
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm4,           xmm0
-        movdqa      xmm5,           xmm7
-
-        paddd       xmm6,           xmm4
-        punpckldq   xmm7,           xmm0
-
-        punpckhdq   xmm5,           xmm0
-        paddd       xmm7,           xmm5
-
-        movdqa      xmm4,           xmm6
-        movdqa      xmm5,           xmm7
-
-        psrldq      xmm4,           8
-        psrldq      xmm5,           8
-
-        paddd       xmm6,           xmm4
-        paddd       xmm7,           xmm5
-
-        mov         rdi,            arg(4)   ; [SSE]
-        mov         rax,            arg(5)   ; [Sum]
-
-        movd DWORD PTR [rdi],       xmm6
-        movd DWORD PTR [rax],       xmm7
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
deleted file mode 100644
index 47b052abc..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ /dev/null
@@ -1,868 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_ports/mem.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
-                                       const uint16_t *ref, int ref_stride,
-                                       uint32_t *sse, int *sum);
-
-uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride,
-                                    uint32_t *sse, int *sum);
-
-uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
-                                      const uint16_t *ref, int ref_stride,
-                                      uint32_t *sse, int *sum);
-
-static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
-                                   const uint16_t *ref, int ref_stride, int w,
-                                   int h, uint32_t *sse, int *sum,
-                                   high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int32_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int32_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
-#define HIGH_GET_VAR(S)                                                       \
-  void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
-                                         const uint8_t *ref8, int ref_stride, \
-                                         uint32_t *sse, int *sum) {           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
-    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
-                                       sum);                                  \
-  }                                                                           \
-                                                                              \
-  void aom_highbd_10_get##S##x##S##var_sse2(                                  \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
-      int ref_stride, uint32_t *sse, int *sum) {                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
-    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
-                                       sum);                                  \
-    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
-    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
-  }                                                                           \
-                                                                              \
-  void aom_highbd_12_get##S##x##S##var_sse2(                                  \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
-      int ref_stride, uint32_t *sse, int *sum) {                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
-    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
-                                       sum);                                  \
-    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
-    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
-  }
-
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
-
-#undef HIGH_GET_VAR
-
-#define VAR_FN(w, h, block_size, shift)                                    \
-  uint32_t aom_highbd_8_variance##w##x##h##_sse2(                          \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_8_variance_sse2(                                                \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
-  }                                                                        \
-                                                                           \
-  uint32_t aom_highbd_10_variance##w##x##h##_sse2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_10_variance_sse2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }                                                                        \
-                                                                           \
-  uint32_t aom_highbd_12_variance##w##x##h##_sse2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_12_variance_sse2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
-
-#undef VAR_FN
-
-unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                        const uint8_t *ref8, int ref_stride,
-                                        unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                         aom_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-// 16x16 entry point built on highbd_10_variance_sse2().
-// src8/ref8 are converted with CONVERT_TO_SHORTPTR before use. The helper
-// stores its SSE through `sse`, and that stored value is what is returned;
-// the sum it also reports is not used.
-unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                         const uint8_t *ref8, int ref_stride,
-                                         unsigned int *sse) {
-  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
-  int unused_sum;  // required by the helper's signature only
-
-  highbd_10_variance_sse2(src16, src_stride, ref16, ref_stride, 16, 16, sse,
-                          &unused_sum, aom_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-// 16x16 entry point built on highbd_12_variance_sse2().
-// src8/ref8 are converted with CONVERT_TO_SHORTPTR before use. The helper
-// stores its SSE through `sse`, and that stored value is what is returned;
-// the sum it also reports is not used.
-unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                         const uint8_t *ref8, int ref_stride,
-                                         unsigned int *sse) {
-  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
-  int unused_sum;  // required by the helper's signature only
-
-  highbd_12_variance_sse2(src16, src_stride, ref16, ref_stride, 16, 16, sse,
-                          &unused_sum, aom_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-// 8x8 entry point built on highbd_8_variance_sse2().
-// src8/ref8 are converted with CONVERT_TO_SHORTPTR before use. The helper
-// stores its SSE through `sse`, and that stored value is what is returned;
-// the sum it also reports is not used.
-unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                      const uint8_t *ref8, int ref_stride,
-                                      unsigned int *sse) {
-  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
-  int unused_sum;  // required by the helper's signature only
-
-  highbd_8_variance_sse2(src16, src_stride, ref16, ref_stride, 8, 8, sse,
-                         &unused_sum, aom_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-// 8x8 entry point built on highbd_10_variance_sse2().
-// src8/ref8 are converted with CONVERT_TO_SHORTPTR before use. The helper
-// stores its SSE through `sse`, and that stored value is what is returned;
-// the sum it also reports is not used.
-unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                       const uint8_t *ref8, int ref_stride,
-                                       unsigned int *sse) {
-  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
-  int unused_sum;  // required by the helper's signature only
-
-  highbd_10_variance_sse2(src16, src_stride, ref16, ref_stride, 8, 8, sse,
-                          &unused_sum, aom_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-// 8x8 entry point built on highbd_12_variance_sse2().
-// src8/ref8 are converted with CONVERT_TO_SHORTPTR before use. The helper
-// stores its SSE through `sse`, and that stored value is what is returned;
-// the sum it also reports is not used.
-unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                       const uint8_t *ref8, int ref_stride,
-                                       unsigned int *sse) {
-  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8);
-  int unused_sum;  // required by the helper's signature only
-
-  highbd_12_variance_sse2(src16, src_stride, ref16, ref_stride, 8, 8, sse,
-                          &unused_sum, aom_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-// Prototypes for the kernels implemented in
-// highbd_subpel_variance_impl_sse2.asm. Each kernel returns an int (the
-// wrappers in this file accumulate it as `se`) and writes its SSE to *sse.
-// The two trailing void * parameters are unused placeholders kept for
-// PIC-enabled builds.
-//
-// DECL carries no trailing ';' on purpose: DECLS supplies the separator
-// between the two prototypes and the use site supplies the final one, so the
-// expansion contains no empty file-scope declarations.
-#define DECL(w, opt)                                                         \
-  int aom_highbd_sub_pixel_variance##w##xh_##opt(                            \
-      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
-      unsigned int *sse, void *unused0, void *unused)
-#define DECLS(opt) \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-
-#undef DECLS
-#undef DECL
-
-// FN(w, h, wf, wlog2, hlog2, opt, cast) defines three wrappers,
-// aom_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>, on top of the
-// aom_highbd_sub_pixel_variance<wf>xh_<opt> kernels.
-//   w, h          block dimensions.
-//   wf            width of the kernel invoked per column strip. Additional
-//                 strips are taken at the fixed offsets 16, 32 and 48, so
-//                 w > wf only makes sense with wf == 16.
-//   wlog2, hlog2  the squared sum is shifted right by wlog2 + hlog2.
-//   cast          cast applied to se before it is squared.
-// Every wrapper stores the (possibly rescaled) SSE in *sse_ptr and returns
-// sse - ((se * se) >> (wlog2 + hlog2)).
-//   8-bit:  no rescaling and no clamping of the result.
-//   10-bit: se and sse go through ROUND_POWER_OF_TWO with 2 and 4 bits, and a
-//           negative result is returned as 0.
-//   12-bit: the block is processed in slices of at most 16 rows with the SSE
-//           accumulated in 64 bits (presumably so one kernel call cannot
-//           overflow its 32-bit SSE output -- TODO confirm against the asm);
-//           se and sse then go through ROUND_POWER_OF_TWO with 4 and 8 bits,
-//           and a negative result is returned as 0.
-#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
-  uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
-        NULL);                                                                 \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    int64_t var;                                                               \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
-        NULL);                                                                 \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 2);                                            \
-    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    int start_row;                                                             \
-    uint32_t sse;                                                              \
-    int se = 0;                                                                \
-    int64_t var;                                                               \
-    uint64_t long_sse = 0;                                                     \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    for (start_row = 0; start_row < h; start_row += 16) {                      \
-      uint32_t sse2;                                                           \
-      int height = h - start_row < 16 ? h - start_row : 16;                    \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
-          NULL);                                                               \
-      se += se2;                                                               \
-      long_sse += sse2;                                                        \
-      if (w > wf) {                                                            \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        long_sse += sse2;                                                      \
-        if (w > wf * 2) {                                                      \
-          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
-              height, &sse2, NULL, NULL);                                      \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
-              height, &sse2, NULL, NULL);                                      \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 4);                                            \
-    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }
-
-// Instantiates the 8/10/12-bit sub-pixel variance wrappers for every
-// supported block size. Columns: w, h, wf (width of the kernel used per
-// strip), log2(w), log2(h), opt, cast applied to se before squaring.
-//
-// FN expands to complete function definitions, so neither the rows nor the
-// FNS(sse2) use are followed by ';': a ';' after a function body is an empty
-// file-scope declaration, which ISO C does not permit.
-#define FNS(opt)                       \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
-  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t))    \
-  FN(16, 4, 16, 4, 2, opt, (int64_t))  \
-  FN(8, 32, 8, 3, 5, opt, (int64_t))   \
-  FN(32, 8, 16, 5, 3, opt, (int64_t))  \
-  FN(16, 64, 16, 4, 6, opt, (int64_t)) \
-  FN(64, 16, 16, 6, 4, opt, (int64_t))
-
-FNS(sse2)
-
-#undef FNS
-#undef FN
-
-// Prototypes for the aom_highbd_sub_pixel_avg_variance<w>xh_<opt> kernels,
-// which take a second predictor (sec, sec_stride) in addition to src and dst.
-// Each kernel returns an int (the wrappers in this file accumulate it as
-// `se`) and writes its SSE to *sse. The two trailing void * parameters are
-// unused placeholders kept for PIC-enabled builds.
-//
-// DECL carries no trailing ';' on purpose: DECLS supplies the separator
-// between the two prototypes and the use site supplies the final one, so the
-// expansion contains no empty file-scope declarations.
-#define DECL(w, opt)                                                         \
-  int aom_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
-      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
-      void *unused)
-#define DECLS(opt) \
-  DECL(16, opt);   \
-  DECL(8, opt)
-
-DECLS(sse2);
-#undef DECL
-#undef DECLS
-
-// FN(w, h, wf, wlog2, hlog2, opt, cast) defines three wrappers,
-// aom_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt>, on top of the
-// aom_highbd_sub_pixel_avg_variance<wf>xh_<opt> kernels.
-//   w, h          block dimensions. w is also passed in the kernels'
-//                 sec_stride position, i.e. rows of the second predictor sec8
-//                 are taken to be w samples apart.
-//   wf            width of the kernel invoked per column strip. Additional
-//                 strips are taken at the fixed offsets 16, 32 and 48, so
-//                 w > wf only makes sense with wf == 16.
-//   wlog2, hlog2  the squared sum is shifted right by wlog2 + hlog2.
-//   cast          cast applied to se before it is squared.
-// Every wrapper stores the (possibly rescaled) SSE in *sse_ptr and returns
-// sse - ((se * se) >> (wlog2 + hlog2)).
-//   8-bit:  no rescaling and no clamping of the result.
-//   10-bit: se and sse go through ROUND_POWER_OF_TWO with 2 and 4 bits, and a
-//           negative result is returned as 0.
-//   12-bit: the block is processed in slices of at most 16 rows (sec advances
-//           by start_row * w) with the SSE accumulated in 64 bits (presumably
-//           so one kernel call cannot overflow its 32-bit SSE output -- TODO
-//           confirm against the asm); se and sse then go through
-//           ROUND_POWER_OF_TWO with 4 and 8 bits, and a negative result is
-//           returned as 0.
-#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
-  uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
-      const uint8_t *sec8) {                                                   \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
-    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
-      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
-      const uint8_t *sec8) {                                                   \
-    int64_t var;                                                               \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
-    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
-      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 2);                                            \
-    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
-      const uint8_t *sec8) {                                                   \
-    int start_row;                                                             \
-    int64_t var;                                                               \
-    uint32_t sse;                                                              \
-    int se = 0;                                                                \
-    uint64_t long_sse = 0;                                                     \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
-    for (start_row = 0; start_row < h; start_row += 16) {                      \
-      uint32_t sse2;                                                           \
-      int height = h - start_row < 16 ? h - start_row : 16;                    \
-      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
-          w, height, &sse2, NULL, NULL);                                       \
-      se += se2;                                                               \
-      long_sse += sse2;                                                        \
-      if (w > wf) {                                                            \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
-            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
-        se += se2;                                                             \
-        long_sse += sse2;                                                      \
-        if (w > wf * 2) {                                                      \
-          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
-              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
-              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 4);                                            \
-    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }
-
-// Instantiates the 8/10/12-bit sub-pixel avg-variance wrappers for every
-// supported block size. Columns: w, h, wf (width of the kernel used per
-// strip), log2(w), log2(h), opt, cast applied to se before squaring.
-//
-// FN expands to complete function definitions, so neither the rows nor the
-// FNS(sse2) use are followed by ';': a ';' after a function body is an empty
-// file-scope declaration, which ISO C does not permit.
-#define FNS(opt)                       \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
-  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t))    \
-  FN(16, 4, 16, 4, 2, opt, (int64_t))  \
-  FN(8, 32, 8, 3, 5, opt, (int64_t))   \
-  FN(32, 8, 16, 5, 3, opt, (int64_t))  \
-  FN(16, 64, 16, 4, 6, opt, (int64_t)) \
-  FN(64, 16, 16, 6, 4, opt, (int64_t))
-
-FNS(sse2)
-
-#undef FNS
-#undef FN
-
-// Builds a width x height high-bitdepth prediction into comp_pred8 from the
-// reference ref8 at the sub-pel position (subpel_x_q3, subpel_y_q3). The
-// output is written with a row pitch of `width` samples.
-//
-// xd, cm, mi_row, mi_col, mv: consulted only when xd != NULL. If the selected
-//   scale factors report scaling, the block is produced by
-//   av1_make_inter_predictor() from xd's own reference buffer and the function
-//   returns; ref8, ref_stride, bd, subpel_x_q3, subpel_y_q3 and subpel_search
-//   are not used on that path.
-// subpel_search: 1 selects the 4-tap EIGHTTAP_REGULAR parameters, any other
-//   value the EIGHTTAP_REGULAR parameters for block size 8.
-// bd: forwarded to the aom_highbd_convolve8_* calls.
-void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
-                                    const struct AV1Common *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred8, int width, int height,
-                                    int subpel_x_q3, int subpel_y_q3,
-                                    const uint8_t *ref8, int ref_stride, int bd,
-                                    int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      // Keep the scaled position inside the reference frame plus its
-      // interpolation margins.
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      // Integer part of the position selects the source sample, the
-      // fractional part goes into subpel_params.
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter =
-      (subpel_search == 1)
-          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
-          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    // Full-pel position: plain copy of the reference block.
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    if (width >= 8) {
-      assert(!(width & 7));
-      /*Read 8 pixels one row at a time.*/
-      for (int i = 0; i < height; i++) {
-        for (int j = 0; j < width; j += 8) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
-          comp_pred += 8;
-          ref += 8;
-        }
-        // comp_pred is contiguous; only ref needs to skip to the next row.
-        ref += ref_stride - width;
-      }
-    } else {
-      assert(!(width & 3));
-      /*Read 4 pixels two rows at a time.*/
-      for (int i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
-        comp_pred += 8;
-        ref += 2 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    // Horizontal-only interpolation.
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
-                               NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    // Vertical-only interpolation.
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                              kernel, 16, width, height, bd);
-  } else {
-    // Two-pass interpolation: horizontal into `temp` (row pitch MAX_SB_SIZE),
-    // then vertical from `temp` into comp_pred8.
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    // The horizontal pass writes intermediate_height rows into temp, so the
-    // bound must be the number of rows temp actually holds.
-    assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);
-    // Start (taps / 2 - 1) rows above the block so the vertical filter has
-    // its leading context rows available in temp.
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
-                               ref_stride, CONVERT_TO_BYTEPTR(temp),
-                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                               intermediate_height, bd);
-    aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
-        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
-        bd);
-  }
-}
-
-// Produces the upsampled prediction in comp_pred8 via
-// aom_highbd_upsampled_pred() and then replaces it, in place, with the
-// per-sample _mm_avg_epu16 average of that prediction and pred8. Both buffers
-// are walked as one contiguous run of width * height 16-bit samples.
-void aom_highbd_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-
-  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
-  assert(!(width * height & 7));
-
-  const uint16_t *other = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *out = CONVERT_TO_SHORTPTR(comp_pred8);
-
-  // One iteration handles 8 samples (a full 128-bit vector).
-  for (int vectors_left = width * height >> 3; vectors_left > 0;
-       --vectors_left) {
-    const __m128i upsampled = _mm_loadu_si128((const __m128i *)out);
-    const __m128i second = _mm_loadu_si128((const __m128i *)other);
-    _mm_storeu_si128((__m128i *)out, _mm_avg_epu16(upsampled, second));
-    out += 8;
-    other += 8;
-  }
-}
-
-static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
-                                               const __m128i *w0,
-                                               const __m128i *w1,
-                                               const __m128i *r,
-                                               void *const result) {
-  assert(DIST_PRECISION_BITS <= 4);
-  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
-  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
-  __m128i sum = _mm_adds_epu16(mult0, mult1);
-  __m128i round = _mm_adds_epu16(sum, *r);
-  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
-
-  xx_storeu_128(result, shift);
-}
-
-void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
-                                       const uint8_t *pred8, int width,
-                                       int height, const uint8_t *ref8,
-                                       int ref_stride,
-                                       const JNT_COMP_PARAMS *jcp_param) {
-  int i;
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-
-  if (width >= 8) {
-    // Read 8 pixels one row at a time
-    assert(!(width & 7));
-    for (i = 0; i < height; ++i) {
-      int j;
-      for (j = 0; j < width; j += 8) {
-        __m128i p0 = xx_loadu_128(ref);
-        __m128i p1 = xx_loadu_128(pred);
-
-        highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
-
-        comp_pred += 8;
-        pred += 8;
-        ref += 8;
-      }
-      ref += ref_stride - width;
-    }
-  } else {
-    // Read 4 pixels two rows at a time
-    assert(!(width & 3));
-    for (i = 0; i < height; i += 2) {
-      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
-      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
-      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
-      __m128i p1 = xx_loadu_128(pred);
-
-      highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
-
-      comp_pred += 8;
-      pred += 8;
-      ref += 2 * ref_stride;
-    }
-  }
-}
-
-void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  int n;
-  int i;
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  assert(!(width * height & 7));
-  n = width * height >> 3;
-
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred16);
-    __m128i p1 = xx_loadu_128(pred);
-
-    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
-
-    comp_pred16 += 8;
-    pred += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
deleted file mode 100644
index df5449a9d..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h> /* SSE4.1 */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/variance.h"
-#include "aom_dsp/aom_filter.h"
-
-static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
-                                         const uint8_t *b8, int b_stride,
-                                         uint64_t *sse, int64_t *sum) {
-  __m128i u0, u1, u2, u3;
-  __m128i s0, s1, s2, s3;
-  __m128i t0, t1, x0, y0;
-  __m128i a0, a1, a2, a3;
-  __m128i b0, b1, b2, b3;
-  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
-  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
-  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
-  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
-
-  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
-  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
-  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
-  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
-
-  u0 = _mm_unpacklo_epi16(a0, a1);
-  u1 = _mm_unpacklo_epi16(a2, a3);
-  u2 = _mm_unpacklo_epi16(b0, b1);
-  u3 = _mm_unpacklo_epi16(b2, b3);
-
-  s0 = _mm_sub_epi16(u0, u2);
-  s1 = _mm_sub_epi16(u1, u3);
-
-  t0 = _mm_madd_epi16(s0, k_one_epi16);
-  t1 = _mm_madd_epi16(s1, k_one_epi16);
-
-  s2 = _mm_hadd_epi32(t0, t1);
-  s3 = _mm_hadd_epi32(s2, s2);
-  y0 = _mm_hadd_epi32(s3, s3);
-
-  t0 = _mm_madd_epi16(s0, s0);
-  t1 = _mm_madd_epi16(s1, s1);
-
-  s2 = _mm_hadd_epi32(t0, t1);
-  s3 = _mm_hadd_epi32(s2, s2);
-  x0 = _mm_hadd_epi32(s3, s3);
-
-  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
-  *sum = (int64_t)_mm_extract_epi32(y0, 0);
-}
-
-uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
-                                         const uint8_t *b, int b_stride,
-                                         uint32_t *sse) {
-  int64_t sum, diff;
-  uint64_t local_sse;
-
-  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
-  *sse = (uint32_t)local_sse;
-
-  diff = (int64_t)*sse - ((sum * sum) >> 4);
-  return (diff >= 0) ? (uint32_t)diff : 0;
-}
-
-uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  int64_t sum, diff;
-  uint64_t local_sse;
-
-  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
-  sum = ROUND_POWER_OF_TWO(sum, 2);
-
-  diff = (int64_t)*sse - ((sum * sum) >> 4);
-  return (diff >= 0) ? (uint32_t)diff : 0;
-}
-
-uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  int64_t sum, diff;
-  uint64_t local_sse;
-
-  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
-  sum = ROUND_POWER_OF_TWO(sum, 4);
-
-  diff = (int64_t)*sse - ((sum * sum) >> 4);
-  return diff >= 0 ? (uint32_t)diff : 0;
-}
-
-// Sub-pixel
-uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
-                                  sse);
-}
-
-uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
-                                   dst_stride, sse);
-}
-
-uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
-                                   dst_stride, sse);
-}
-
-// Sub-pixel average
-
-uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse,
-    const uint8_t *second_pred) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
-                           CONVERT_TO_BYTEPTR(temp2), 4);
-
-  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
-                                  sse);
-}
-
-uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse,
-    const uint8_t *second_pred) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
-                           CONVERT_TO_BYTEPTR(temp2), 4);
-
-  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
-                                   dst_stride, sse);
-}
-
-uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse,
-    const uint8_t *second_pred) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
-                           CONVERT_TO_BYTEPTR(temp2), 4);
-
-  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
-                                   dst_stride, sse);
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
deleted file mode 100644
index 1e67d392e..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c
+++ /dev/null
@@ -1,811 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE __m256i dc_sum_64(const uint8_t *ref) {
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i y0 = _mm256_sad_epu8(x0, zero);
-  __m256i y1 = _mm256_sad_epu8(x1, zero);
-  y0 = _mm256_add_epi64(y0, y1);
-  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
-  y0 = _mm256_add_epi64(u0, y0);
-  u0 = _mm256_unpackhi_epi64(y0, y0);
-  return _mm256_add_epi16(y0, u0);
-}
-
-static INLINE __m256i dc_sum_32(const uint8_t *ref) {
-  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i y = _mm256_sad_epu8(x, zero);
-  __m256i u = _mm256_permute2x128_si256(y, y, 1);
-  y = _mm256_add_epi64(u, y);
-  u = _mm256_unpackhi_epi64(y, y);
-  return _mm256_add_epi16(y, u);
-}
-
-static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
-                                  ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r);
-    dst += stride;
-  }
-}
-
-static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
-                                    int height, uint8_t *dst,
-                                    ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r0);
-    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
-    dst += stride;
-  }
-}
-
-static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
-                                  ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r);
-    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
-    dst += stride;
-  }
-}
-
-void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_32(above);
-  __m256i sum_left = dc_sum_32(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm256_srai_epi16(sum_left, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_32(left);
-  (void)above;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-// There are 32 rows togeter. This function does line:
-// 0,1,2,3, and 16,17,18,19. The next call would do
-// 4,5,6,7, and 20,21,22,23. So 4 times of calling
-// would finish 32 rows.
-static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
-                                        ptrdiff_t stride) {
-  __m256i t[4];
-  __m256i m = _mm256_setzero_si256();
-  const __m256i inc = _mm256_set1_epi8(4);
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    t[i] = _mm256_shuffle_epi8(*row, m);
-    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
-    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
-    _mm256_storeu_si256((__m256i *)dst, r0);
-    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
-    dst += stride;
-    m = _mm256_add_epi8(m, inc);
-  }
-}
-
-void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
-
-  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
-
-  __m256i v = _mm256_unpacklo_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  v = _mm256_unpackhi_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  u = _mm256_unpackhi_epi8(left_col, left_col);
-
-  v = _mm256_unpacklo_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  v = _mm256_unpackhi_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// Rectangle
-
-// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
-// Use a header file, intrapred_common_x86.h
-static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
-  __m128i x = _mm_load_si128((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_sad_epu8(x, zero);
-  const __m128i high = _mm_unpackhi_epi64(x, x);
-  return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i top_sum = dc_sum_32_sse2(above);
-  __m128i left_sum = dc_sum_16_sse2(left);
-  left_sum = _mm_add_epi16(top_sum, left_sum);
-  uint32_t sum = _mm_cvtsi128_si32(left_sum);
-  sum += 24;
-  sum /= 48;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_32(above);
-  __m256i sum_left = dc_sum_64(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 48;
-  sum /= 96;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = dc_sum_64(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 64;
-  sum /= 128;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = dc_sum_32(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 48;
-  sum /= 96;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 40;
-  sum /= 80;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i sum = dc_sum_16_sse2(left);
-  (void)above;
-
-  const __m128i eight = _mm_set1_epi16(8);
-  sum = _mm_add_epi16(sum, eight);
-  sum = _mm_srai_epi16(sum, 4);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i r = _mm_shuffle_epi8(sum, zero);
-  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_64(left);
-  (void)above;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_64(left);
-  (void)above;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_32(left);
-  (void)above;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i sum = dc_sum_16_sse2(left);
-  (void)above;
-
-  const __m128i eight = _mm_set1_epi16(8);
-  sum = _mm_add_epi16(sum, eight);
-  sum = _mm_srai_epi16(sum, 4);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i r = _mm_shuffle_epi8(sum, zero);
-  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 64, dst, stride);
-}
-
-void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 32, dst, stride);
-}
-
-void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 16 16-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
-                                 const __m256i *topleft) {
-  const __m256i base =
-      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
-
-  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
-  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
-  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
-
-  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
-  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
-  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
-
-  pl = _mm256_andnot_si256(mask1, *left);
-
-  ptl = _mm256_and_si256(mask2, *topleft);
-  pt = _mm256_andnot_si256(mask2, *top);
-  pt = _mm256_or_si256(pt, ptl);
-  pt = _mm256_and_si256(mask1, pt);
-
-  return _mm256_or_si256(pt, pl);
-}
-
-// Return 16 8-bit pixels in one row (__m128i)
-static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
-                                      const __m256i *topleft) {
-  const __m256i p0 = paeth_pred(left, top, topleft);
-  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i p = _mm256_packus_epi16(p0, p1);
-  return _mm256_castsi256_si128(p);
-}
-
-static INLINE __m256i get_top_vector(const uint8_t *above) {
-  const __m128i x = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
-  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
-}
-
-void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i x = _mm_loadl_epi64((const __m128i *)left);
-  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-static INLINE __m256i get_left_vector(const uint8_t *left) {
-  const __m128i x = _mm_load_si128((const __m128i *)left);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-}
-
-void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-
-  l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  for (int j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-      _mm_store_si128((__m128i *)dst, row);
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-// Return 32 8-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
-                                      const __m256i *top1,
-                                      const __m256i *topleft) {
-  __m256i p0 = paeth_pred(left, top0, topleft);
-  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i x0 = _mm256_packus_epi16(p0, p1);
-
-  p0 = paeth_pred(left, top1, topleft);
-  p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i x1 = _mm256_packus_epi16(p0, p1);
-
-  return _mm256_permute2x128_si256(x0, x1, 0x20);
-}
-
-void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i l = get_left_vector(left);
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
-
-    _mm256_storeu_si256((__m256i *)dst, r);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m256i l = get_left_vector(left);
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-
-  l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 2; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  const __m256i l = get_left_vector(left);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-    _mm_store_si128((__m128i *)(dst + 32), r2);
-    _mm_store_si128((__m128i *)(dst + 48), r3);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
deleted file mode 100644
index 5b2452c8e..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c
+++ /dev/null
@@ -1,1430 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
-                                ptrdiff_t stride) {
-  for (int i = 0; i < height; i += 2) {
-    *(uint32_t *)dst = dc;
-    dst += stride;
-    *(uint32_t *)dst = dc;
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
-                                ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_storel_epi64((__m128i *)dst, *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    _mm_store_si128((__m128i *)(dst + 16), *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    _mm_store_si128((__m128i *)(dst + 16), *row);
-    _mm_store_si128((__m128i *)(dst + 32), *row);
-    _mm_store_si128((__m128i *)(dst + 48), *row);
-    dst += stride;
-  }
-}
-
-static INLINE __m128i dc_sum_4(const uint8_t *ref) {
-  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_unpacklo_epi8(x, zero);
-  return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_8(const uint8_t *ref) {
-  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_16(const uint8_t *ref) {
-  __m128i x = _mm_load_si128((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_sad_epu8(x, zero);
-  const __m128i high = _mm_unpackhi_epi64(x, x);
-  return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-static INLINE __m128i dc_sum_64(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
-  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x2 = _mm_sad_epu8(x2, zero);
-  x3 = _mm_sad_epu8(x3, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  x2 = _mm_add_epi16(x2, x3);
-  x0 = _mm_add_epi16(x0, x2);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
-static INLINE int divide_using_multiply_shift(int num, int shift1,
-                                              int multiplier) {
-  const int interm = num >> shift1;
-  return interm * multiplier >> DC_SHIFT2;
-}
-
-// -----------------------------------------------------------------------------
-// DC_PRED
-
-void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_4(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 6;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_4(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 10;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 6;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 12;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 20;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 10;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 12;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 24;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_64(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 40;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_8(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 20;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_16(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 24;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_64(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 48;
-  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_64(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 64;
-  sum /= 128;
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_32(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 48;
-  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_16(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 40;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
-  sum_above = _mm_add_epi16(sum_above, two);
-  sum_above = _mm_srai_epi16(sum_above, 2);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  sum_above = _mm_packus_epi16(sum_above, sum_above);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
-  sum_above = _mm_add_epi16(sum_above, two);
-  sum_above = _mm_srai_epi16(sum_above, 2);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  sum_above = _mm_packus_epi16(sum_above, sum_above);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  sum_left = _mm_packus_epi16(sum_left, sum_left);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  sum_left = _mm_packus_epi16(sum_left, sum_left);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
-  sum_left = _mm_add_epi16(sum_left, two);
-  sum_left = _mm_srai_epi16(sum_left, 2);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
-  sum_left = _mm_add_epi16(sum_left, two);
-  sum_left = _mm_srai_epi16(sum_left, 2);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const uint32_t pred = 0x80808080;
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const uint32_t pred = 0x80808080;
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const uint32_t pred = *(uint32_t *)above;
-  (void)left;
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const uint32_t pred = *(uint32_t *)above;
-  (void)left;
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, int height) {
-  const __m128i row0 = _mm_load_si128((__m128i const *)above);
-  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, row0);
-    _mm_store_si128((__m128i *)(dst + 16), row1);
-    dst += stride;
-  }
-}
-
-void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 8);
-}
-
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 16);
-}
-
-void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 64);
-}
-
-static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, int height) {
-  const __m128i row0 = _mm_load_si128((__m128i const *)above);
-  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
-  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
-  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, row0);
-    _mm_store_si128((__m128i *)(dst + 16), row1);
-    _mm_store_si128((__m128i *)(dst + 32), row2);
-    _mm_store_si128((__m128i *)(dst + 48), row3);
-    dst += stride;
-  }
-}
-
-void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 64);
-}
-
-void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 32);
-}
-
-void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 16);
-}
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-  left_col = _mm_unpackhi_epi64(left_col, left_col);
-  row0 = _mm_shufflelo_epi16(left_col, 0);
-  row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_load_si128((__m128i const *)left);
-  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
-  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
-  row0 = _mm_shufflelo_epi16(left_col_low, 0);
-  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  row0 = _mm_shufflelo_epi16(left_col_high, 0);
-  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
-  row0 = _mm_shufflelo_epi16(left_col_high, 0);
-  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above, const uint8_t *left,
-                                      int count) {
-  (void)above;
-  for (int i = 0; i < count; ++i) {
-    const __m128i left_col = _mm_load_si128((__m128i const *)left);
-    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
-    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
-    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
-    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
-    row0 = _mm_shufflelo_epi16(left_col_low, 0);
-    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    row0 = _mm_shufflelo_epi16(left_col_high, 0);
-    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
-    row0 = _mm_shufflelo_epi16(left_col_high, 0);
-    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-    left += 16;
-  }
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  h_predictor_8x16xc(dst, stride, above, left, 1);
-}
-
-void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  h_predictor_8x16xc(dst, stride, above, left, 2);
-}
-
-static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
-    dst += stride;
-  }
-}
-
-static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
-  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
-  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
-  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
-  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
-
-  row[0] = _mm_unpacklo_epi64(u0, u0);
-  row[1] = _mm_unpacklo_epi64(u1, u1);
-  row[2] = _mm_unpacklo_epi64(u2, u2);
-  row[3] = _mm_unpacklo_epi64(u3, u3);
-}
-
-static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
-  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
-  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
-  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
-  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
-
-  row[0] = _mm_unpackhi_epi64(u0, u0);
-  row[1] = _mm_unpackhi_epi64(u1, u1);
-  row[2] = _mm_unpackhi_epi64(u2, u2);
-  row[3] = _mm_unpackhi_epi64(u3, u3);
-}
-
-// Process 16x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_low_4pixels(left, row);
-  h_pred_store_16xh(row, 4, dst, stride);
-}
-
-// Process 16x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_high_4pixels(left, row);
-  h_pred_store_16xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_16x8_1(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_16x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_16x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int count) {
-  int i = 0;
-  do {
-    const __m128i left_col = _mm_load_si128((const __m128i *)left);
-    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
-    dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
-    dst += stride << 2;
-
-    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
-    dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
-    dst += stride << 2;
-
-    left += 16;
-    i++;
-  } while (i < count);
-}
-
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_16xh(dst, stride, left, 2);
-}
-
-void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_16xh(dst, stride, left, 4);
-}
-
-static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
-    _mm_store_si128((__m128i *)(dst + 16), row[i]);
-    dst += stride;
-  }
-}
-
-// Process 32x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_low_4pixels(left, row);
-  h_pred_store_32xh(row, 4, dst, stride);
-}
-
-// Process 32x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_high_4pixels(left, row);
-  h_pred_store_32xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
-
-  left_col = _mm_load_si128((const __m128i *)left);
-
-  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
-
-  left_col = _mm_load_si128((const __m128i *)left);
-
-  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-  dst += stride << 2;
-
-  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int height) {
-  int i = height >> 2;
-  do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
-    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r0);
-    _mm_store_si128((__m128i *)(dst + stride), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
-    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
-    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
-    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
-    left += 4;
-    dst += stride * 4;
-  } while (--i);
-}
-
-void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_32xh(dst, stride, left, 64);
-}
-
-static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int height) {
-  int i = height >> 2;
-  do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
-    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r0);
-    _mm_store_si128((__m128i *)(dst + 32), r0);
-    _mm_store_si128((__m128i *)(dst + 48), r0);
-    _mm_store_si128((__m128i *)(dst + stride), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
-    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
-    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
-    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
-    left += 4;
-    dst += stride * 4;
-  } while (--i);
-}
-
-void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 64);
-}
-
-void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 32);
-}
-
-void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 16);
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
deleted file mode 100644
index 9aece27be..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
+++ /dev/null
@@ -1,625 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pb_1: times 16 db 1
-pw_4:  times 8 dw 4
-pw_8:  times 8 dw 8
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-dc_128: times 16 db 128
-pw2_4:  times 8 dw 2
-pw2_8:  times 8 dw 4
-pw2_16:  times 8 dw 8
-pw2_32:  times 8 dw 16
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb               %4, %1, %3
-  pxor                %3, %1
-  pand                %3, [GLOBAL(pb_1)]
-  psubb               %4, %3
-  pavgb               %4, %2
-%endmacro
-
-INIT_XMM sse2
-cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  movd                  m2, [leftq]
-  movd                  m0, [aboveq]
-  pxor                  m1, m1
-  punpckldq             m0, m2
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw_4)]
-  psraw                 m0, 3
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
-  movifnidn          leftq, leftmp
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [leftq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_4)]
-  psraw                 m0, 2
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [aboveq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_4)]
-  psraw                 m0, 2
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_8)]
-  psraw                 m0, 4
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_8)]
-  psraw                 m0, 3
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
-  movifnidn          leftq, leftmp
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_8)]
-  psraw                 m0, 3
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movd     m0,        [GLOBAL(dc_128)]
-  movd    [dstq          ], m0
-  movd    [dstq+strideq  ], m0
-  movd    [dstq+strideq*2], m0
-  movd    [dstq+stride3q ], m0
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    m0,        [GLOBAL(dc_128)]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_16)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-
-INIT_XMM sse2
-cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_16)]
-  psraw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_16)]
-  psraw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  mova    m0,        [GLOBAL(dc_128)]
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-  RESTORE_GOT
-  RET
-
-
-INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [leftq]
-  mova                  m4, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  psadbw                m3, m1
-  psadbw                m4, m1
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_32)]
-  psraw                 m0, 6
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_32)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [leftq]
-  mova                  m2, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_32)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  mova    m0,        [GLOBAL(dc_128)]
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
-  movd                  m0, [aboveq]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 4
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 8
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m1
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m1
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m1
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m1
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  movd                  m0, [leftq]
-  punpcklbw             m0, m0
-  punpcklbw             m0, m0
-  pshufd                m1, m0, 0x1
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m1
-  pshufd                m2, m0, 0x2
-  lea                 dstq, [dstq+strideq*2]
-  pshufd                m3, m0, 0x3
-  movd      [dstq        ], m2
-  movd      [dstq+strideq], m3
-  RET
-
-INIT_XMM sse2
-cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  mov                lineq, -2
-  DEFINE_ARGS  dst, stride, line, left, stride3
-  lea             stride3q, [strideq*3]
-  movq                  m0, [leftq    ]
-  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
-.loop:
-  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
-  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
-  movq      [dstq        ], m1
-  movq      [dstq+strideq], m2
-  pshuflw               m1, m0, 0xaa
-  pshuflw               m2, m0, 0xff
-  movq    [dstq+strideq*2], m1
-  movq    [dstq+stride3q ], m2
-  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
-  inc                lineq
-  lea                 dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  mov                lineq, -4
-  DEFINE_ARGS dst, stride, line, left, stride3
-  lea             stride3q, [strideq*3]
-.loop:
-  movd                  m0, [leftq]
-  punpcklbw             m0, m0
-  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
-  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
-  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
-  mova    [dstq          ], m1
-  mova    [dstq+strideq  ], m2
-  pshufd            m1, m0, 0xaa
-  pshufd            m2, m0, 0xff
-  mova    [dstq+strideq*2], m1
-  mova    [dstq+stride3q ], m2
-  inc                lineq
-  lea                leftq, [leftq+4       ]
-  lea                 dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
-  movifnidn              leftq, leftmp
-  mov                    lineq, -8
-  DEFINE_ARGS dst, stride, line, left, stride3
-  lea                 stride3q, [strideq*3]
-.loop:
-  movd                      m0, [leftq]
-  punpcklbw                 m0, m0
-  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
-  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
-  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
-  mova     [dstq             ], m1
-  mova     [dstq+16          ], m1
-  mova     [dstq+strideq     ], m2
-  mova     [dstq+strideq+16  ], m2
-  pshufd                m1, m0, 0xaa
-  pshufd                m2, m0, 0xff
-  mova     [dstq+strideq*2   ], m1
-  mova     [dstq+strideq*2+16], m1
-  mova     [dstq+stride3q    ], m2
-  mova     [dstq+stride3q+16 ], m2
-  inc                    lineq
-  lea                    leftq, [leftq+4       ]
-  lea                     dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
deleted file mode 100644
index 807ed1770..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ /dev/null
@@ -1,1692 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/intrapred_common.h"
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 8 16-bit pixels in one row
-static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
-                                     const __m128i *topleft) {
-  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
-
-  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
-  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
-  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
-
-  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
-  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
-  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
-
-  pl = _mm_andnot_si128(mask1, *left);
-
-  ptl = _mm_and_si128(mask2, *topleft);
-  pt = _mm_andnot_si128(mask2, *top);
-  pt = _mm_or_si128(pt, ptl);
-  pt = _mm_and_si128(mask1, pt);
-
-  return _mm_or_si128(pl, pt);
-}
-
-void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int j = 0; j < 2; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m128i l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-// Return 16 8-bit pixels in one row
-static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
-                                      const __m128i *top1,
-                                      const __m128i *topleft) {
-  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
-  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
-  return _mm_packus_epi16(p0, p1);
-}
-
-void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-
-  l = _mm_load_si128((const __m128i *)(left + 16));
-  rep = _mm_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m128i l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-      _mm_store_si128((__m128i *)dst, row);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  __m128i l16;
-
-  for (int i = 0; i < 8; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-
-  rep = _mm_set1_epi16(0x8000);
-  l = _mm_load_si128((const __m128i *)(left + 16));
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r32l);
-      _mm_store_si128((__m128i *)(dst + 16), r32h);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 2; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i;
-  const __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-    _mm_store_si128((__m128i *)(dst + 32), r2);
-    _mm_store_si128((__m128i *)(dst + 48), r3);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_PRED
-
-// pixels[0]: above and below_pred interleave vector
-// pixels[1]: left vector
-// pixels[2]: right_pred vector
-static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
-                                 int height, __m128i *pixels) {
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  if (height == 4)
-    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  else if (height == 8)
-    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
-
-  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// weight_w[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
-  weight_h[0] = _mm_unpacklo_epi8(t, zero);
-  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-
-  if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  }
-}
-
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
-                                   const __m128i *ww, int h, uint8_t *dst,
-                                   ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
-    b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, ww[0]);
-
-    sum = _mm_add_epi32(s, sum);
-    sum = _mm_add_epi32(sum, round);
-    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 4, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 8, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 16, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-// pixels[2]: left vector
-// pixels[3]: right_pred vector
-// pixels[4]: above and below_pred interleave vector, first half
-// pixels[5]: above and below_pred interleave vector, second half
-// pixels[6]: left vector + 16
-// pixels[7]: right_pred vector
-static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
-                                 int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-  pixels[1] = _mm_unpackhi_epi16(d, bp);
-
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-
-  if (height == 4) {
-    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  } else if (height == 8) {
-    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
-  } else if (height == 16) {
-    pixels[2] = _mm_load_si128((const __m128i *)left);
-  } else {
-    pixels[2] = _mm_load_si128((const __m128i *)left);
-    pixels[4] = pixels[0];
-    pixels[5] = pixels[1];
-    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
-    pixels[7] = pixels[3];
-  }
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  const int we_offset = height < 8 ? 4 : 8;
-  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
-  weight_h[0] = _mm_unpacklo_epi8(we, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-
-  if (height == 4) {
-    we = _mm_srli_si128(we, 4);
-    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
-    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
-    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
-    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-  } else {
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-  }
-
-  if (height == 16) {
-    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(we, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(we, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  } else if (height == 32) {
-    const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
-    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-    const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
-                                   const __m128i *ww, int h, uint8_t *dst,
-                                   ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
-    b = _mm_unpacklo_epi16(b, pixels[3]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 4, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 8, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 16, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[8];
-  load_pixel_w8(above, left, 32, pixels);
-
-  __m128i wh[8], ww[2];
-  load_weight_w8(sm_weight_arrays, 32, wh, ww);
-
-  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left, uint32_t bw,
-                                        uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
-  const __m128i dup16 = _mm_set1_epi32(0x01000100);
-  const __m128i top_right =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
-    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
-    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
-    const __m128i wl_y =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
-    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
-    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
-      const __m128i weights_x =
-          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
-      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
-      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
-      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
-
-      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
-      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
-      const __m128i scale_m_weights_x =
-          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
-      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
-      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
-      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
-
-      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
-      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
-
-      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
-      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
-
-      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
-      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_V_PRED
-
-// pixels[0]: above and below_pred interleave vector
-static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vector
-static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height == 4) {
-    const __m128i weight =
-        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-  } else if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-  } else {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-    weights[2] = _mm_unpackhi_epi8(weight, zero);
-    weights[3] = _mm_sub_epi16(d, weights[2]);
-  }
-}
-
-static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 4, &pixels);
-
-  __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 4, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 8, &pixels);
-
-  __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 8, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 16, &pixels);
-
-  __m128i weights[4];
-  load_weight_v_w4(sm_weight_arrays, 16, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-  pixels[1] = _mm_unpackhi_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_h) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height < 16) {
-    const int offset = height < 8 ? 4 : 8;
-    const __m128i weight =
-        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  } else {
-    const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
-    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-    const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
-                                     int h, uint8_t *dst, ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
-    s0 = _mm_add_epi32(s0, pred_round);
-    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, pred_round);
-    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
-
-    __m128i sum01 = _mm_packus_epi16(s0, s1);
-    sum01 = _mm_shuffle_epi8(sum01, gat);
-    _mm_storel_epi64((__m128i *)dst, sum01);
-    dst += stride;
-
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 4, pixels);
-
-  __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 4, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 8, pixels);
-
-  __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 8, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 16, pixels);
-
-  __m128i wh[4];
-  load_weight_v_w8(sm_weight_arrays, 16, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 32, pixels);
-
-  __m128i wh[8];
-  load_weight_v_w8(sm_weight_arrays, 32, wh);
-
-  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
-}
-
-static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                          const uint8_t *above,
-                                          const uint8_t *left, uint32_t bw,
-                                          uint32_t bh) {
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i dup16 = _mm_set1_epi32(0x01000100);
-  const __m128i bottom_left =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round =
-      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i scale_m_weights_y =
-        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
-    const __m128i wl_y =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
-      // 8 -> 16
-      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
-      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
-      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
-      // top_x * weights_y + scale_m_weights_y * bottom_left
-      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
-      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
-      pred_lo = _mm_add_epi32(pred_lo, round);
-      pred_hi = _mm_add_epi32(pred_hi, round);
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_H_PRED
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  if (height == 4)
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  else if (height == 8)
-    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
-  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
-}
-
-// weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
-  (void)height;
-  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
-  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
-}
-
-static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
-
-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
-    b = _mm_unpacklo_epi16(b, pixel[1]);
-    __m128i sum = _mm_madd_epi16(b, weight[0]);
-
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 4, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 4, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 8, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 16, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-  dst += stride << 3;
-
-  pixels[0] = _mm_srli_si128(pixels[0], 8);
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-// pixels[2]: left vector + 16
-// pixels[3]: right_pred vector
-static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
-
-  if (height == 4) {
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  } else if (height == 8) {
-    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
-  } else if (height == 16) {
-    pixels[0] = _mm_load_si128((const __m128i *)left);
-  } else {
-    pixels[0] = _mm_load_si128((const __m128i *)left);
-    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
-    pixels[3] = pixels[1];
-  }
-}
-
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_w) {
-  (void)height;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
-  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
-  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
-  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-}
-
-static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
-                                     int h, uint8_t *dst, ptrdiff_t stride,
-                                     int second_half) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-
-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
-    b = _mm_unpacklo_epi16(b, pixels[1]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
-    sum0 = _mm_add_epi32(sum0, pred_round);
-    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
-
-    sum1 = _mm_add_epi32(sum1, pred_round);
-    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(sum0, sum1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 4, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 4, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 8, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 8, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 16, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 16, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_h_w8(above, left, 32, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 32, ww);
-
-  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                          const uint8_t *above,
-                                          const uint8_t *left, uint32_t bw,
-                                          uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
-    const __m128i tr_ly =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i weights_x =
-          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
-      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
-      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
-      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
-      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
-      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
-      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
-
-      pred_lo = _mm_add_epi32(pred_lo, pred_round);
-      pred_hi = _mm_add_epi32(pred_hi, pred_round);
-
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
deleted file mode 100644
index 0bc841a7a..000000000
--- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
+++ /dev/null
@@ -1,107 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro REORDER_INPUTS 0
-  ; a c d b  to  a b c d
-  SWAP 1, 3, 2
-%endmacro
-
-%macro TRANSFORM_COLS 0
-  ; input:
-  ; m0 a
-  ; m1 b
-  ; m2 c
-  ; m3 d
-  paddw           m0,        m2
-  psubw           m3,        m1
-
-  ; wide subtract
-  punpcklwd       m4,        m0
-  punpcklwd       m5,        m3
-  psrad           m4,        16
-  psrad           m5,        16
-  psubd           m4,        m5
-  psrad           m4,        1
-  packssdw        m4,        m4             ; e
-
-  psubw           m5,        m4,        m1  ; b
-  psubw           m4,        m2             ; c
-  psubw           m0,        m5
-  paddw           m3,        m4
-                                ; m0 a
-  SWAP            1,         5  ; m1 b
-  SWAP            2,         4  ; m2 c
-                                ; m3 d
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  punpcklwd       m0,        m2
-  punpcklwd       m1,        m3
-  mova            m2,        m0
-  punpcklwd       m0,        m1
-  punpckhwd       m2,        m1
-  pshufd          m1,        m0, 0x0e
-  pshufd          m3,        m2, 0x0e
-%endmacro
-
-; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
-%macro TRANSPOSE_4X4_WIDE 0
-  mova            m3, m0
-  punpcklwd       m0, m1
-  punpckhwd       m3, m1
-  mova            m2, m0
-  punpcklwd       m0, m3
-  punpckhwd       m2, m3
-  pshufd          m1, m0, 0x0e
-  pshufd          m3, m2, 0x0e
-%endmacro
-
-%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
-  movd            m%3,       [outputq]
-  movd            m%4,       [outputq + strideq]
-  punpcklbw       m%3,       m%5
-  punpcklbw       m%4,       m%5
-  paddw           m%1,       m%3
-  paddw           m%2,       m%4
-  packuswb        m%1,       m%5
-  packuswb        m%2,       m%5
-  movd            [outputq], m%1
-  movd            [outputq + strideq], m%2
-%endmacro
-
-INIT_XMM sse2
-cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-  mova            m0,        [inputq +  0]
-  packssdw        m0,        [inputq + 16]
-  mova            m1,        [inputq + 32]
-  packssdw        m1,        [inputq + 48]
-  psraw           m0,        2
-  psraw           m1,        2
-
-  TRANSPOSE_4X4_WIDE
-  REORDER_INPUTS
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  REORDER_INPUTS
-  TRANSFORM_COLS
-
-  pxor            m4, m4
-  ADD_STORE_4P_2X  0, 1, 5, 6, 4
-  lea             outputq, [outputq + 2 * strideq]
-  ADD_STORE_4P_2X  2, 3, 5, 6, 4
-
-  RET
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
deleted file mode 100644
index c3c88245a..000000000
--- a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int width, int height) {
-  int i;
-  assert(width == 4);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; i += 4) {
-    __m128i x0 = xx_loadl_32(a + 0 * a_stride);
-    __m128i x1 = xx_loadl_32(a + 1 * a_stride);
-    __m128i x2 = xx_loadl_32(a + 2 * a_stride);
-    __m128i x3 = xx_loadl_32(a + 3 * a_stride);
-    __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
-    __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
-
-    __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
-
-    x0 = xx_loadl_32(b + 0 * b_stride);
-    x1 = xx_loadl_32(b + 1 * b_stride);
-    x2 = xx_loadl_32(b + 2 * b_stride);
-    x3 = xx_loadl_32(b + 3 * b_stride);
-    x_lo = _mm_unpacklo_epi32(x0, x1);
-    x_hi = _mm_unpacklo_epi32(x2, x3);
-
-    __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
-
-    __m128i sad4x4 = _mm_sad_epu8(x, y);
-    sad = _mm_add_epi32(sad, sad4x4);
-
-    a += 4 * a_stride;
-    b += 4 * b_stride;
-  }
-
-  // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int width, int height) {
-  int i;
-  assert(width == 8);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; i += 2) {
-    __m128i x0 = xx_loadl_64(a + 0 * a_stride);
-    __m128i x1 = xx_loadl_64(a + 1 * a_stride);
-
-    __m128i x = _mm_unpacklo_epi64(x0, x1);
-
-    x0 = xx_loadl_64(b + 0 * b_stride);
-    x1 = xx_loadl_64(b + 1 * b_stride);
-
-    __m128i y = _mm_unpacklo_epi64(x0, x1);
-
-    __m128i sad8x2 = _mm_sad_epu8(x, y);
-    sad = _mm_add_epi32(sad, sad8x2);
-
-    a += 2 * a_stride;
-    b += 2 * b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int i;
-  assert(width == 16);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    __m128i x = xx_loadu_128(a);
-    __m128i y = xx_loadu_128(b);
-
-    __m128i sad16x1 = _mm_sad_epu8(x, y);
-    sad = _mm_add_epi32(sad, sad16x1);
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int i, j;
-  assert(width == 32);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < 2; ++j) {
-      __m128i x = xx_loadu_128(a + j * 16);
-      __m128i y = xx_loadu_128(b + j * 16);
-
-      __m128i sad32_half = _mm_sad_epu8(x, y);
-      sad = _mm_add_epi32(sad, sad32_half);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int i, j;
-  assert(width == 64);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < 4; ++j) {
-      __m128i x = xx_loadu_128(a + j * 16);
-      __m128i y = xx_loadu_128(b + j * 16);
-
-      __m128i sad64_quarter = _mm_sad_epu8(x, y);
-      sad = _mm_add_epi32(sad, sad64_quarter);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
-  int i, j;
-  assert(width == 128);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < 8; ++j) {
-      __m128i x = xx_loadu_128(a + j * 16);
-      __m128i y = xx_loadu_128(b + j * 16);
-
-      __m128i sad64_quarter = _mm_sad_epu8(x, y);
-      sad = _mm_add_epi32(sad, sad64_quarter);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-#define jnt_sadMxN_sse2(m, n)                                                 \
-  unsigned int aom_jnt_sad##m##x##n##_avg_ssse3(                              \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
-                          jcp_param);                                         \
-    return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
-  }
-
-#define jnt_sadMxN_avx2(m, n)                                                 \
-  unsigned int aom_jnt_sad##m##x##n##_avg_avx2(                               \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
-                          jcp_param);                                         \
-    return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
-  }
-
-/* clang-format off */
-jnt_sadMxN_sse2(128, 128)
-jnt_sadMxN_sse2(128, 64)
-jnt_sadMxN_sse2(64, 128)
-jnt_sadMxN_sse2(64, 64)
-jnt_sadMxN_sse2(64, 32)
-jnt_sadMxN_sse2(32, 64)
-jnt_sadMxN_sse2(32, 32)
-jnt_sadMxN_sse2(32, 16)
-jnt_sadMxN_sse2(16, 32)
-jnt_sadMxN_sse2(16, 16)
-jnt_sadMxN_sse2(16, 8)
-jnt_sadMxN_sse2(8, 16)
-jnt_sadMxN_sse2(8, 8)
-jnt_sadMxN_sse2(8, 4)
-jnt_sadMxN_sse2(4, 8)
-jnt_sadMxN_sse2(4, 4)
-jnt_sadMxN_sse2(4, 16)
-jnt_sadMxN_sse2(16, 4)
-jnt_sadMxN_sse2(8, 32)
-jnt_sadMxN_sse2(32, 8)
-jnt_sadMxN_sse2(16, 64)
-jnt_sadMxN_sse2(64, 16)
-    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
deleted file mode 100644
index f9a41a210..000000000
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
-                                        const __m128i *w, const __m128i *r,
-                                        void *const result) {
-  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
-  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
-  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
-  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
-
-  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
-  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
-  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
-  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
-
-  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
-}
-
-void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
-                                 int width, int height, const uint8_t *ref,
-                                 int ref_stride,
-                                 const JNT_COMP_PARAMS *jcp_param) {
-  int i;
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  if (width >= 16) {
-    // Read 16 pixels one row at a time
-    assert(!(width & 15));
-    for (i = 0; i < height; ++i) {
-      int j;
-      for (j = 0; j < width; j += 16) {
-        __m128i p0 = xx_loadu_128(ref);
-        __m128i p1 = xx_loadu_128(pred);
-
-        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-        comp_pred += 16;
-        pred += 16;
-        ref += 16;
-      }
-      ref += ref_stride - width;
-    }
-  } else if (width >= 8) {
-    // Read 8 pixels two row at a time
-    assert(!(width & 7));
-    assert(!(width & 1));
-    for (i = 0; i < height; i += 2) {
-      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
-      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
-      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
-      __m128i p1 = xx_loadu_128(pred);
-
-      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-      comp_pred += 16;
-      pred += 16;
-      ref += 2 * ref_stride;
-    }
-  } else {
-    // Read 4 pixels four row at a time
-    assert(!(width & 3));
-    assert(!(height & 3));
-    for (i = 0; i < height; i += 4) {
-      const uint8_t *row0 = ref + 0 * ref_stride;
-      const uint8_t *row1 = ref + 1 * ref_stride;
-      const uint8_t *row2 = ref + 2 * ref_stride;
-      const uint8_t *row3 = ref + 3 * ref_stride;
-
-      __m128i p0 =
-          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
-                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
-                        row3[0], row3[1], row3[2], row3[3]);
-      __m128i p1 = xx_loadu_128(pred);
-
-      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-      comp_pred += 16;
-      pred += 16;
-      ref += 4 * ref_stride;
-    }
-  }
-}
-
-void aom_jnt_comp_avg_upsampled_pred_ssse3(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred);
-    __m128i p1 = xx_loadu_128(pred);
-
-    compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-#define JNT_SUBPIX_AVG_VAR(W, H)                                         \
-  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3(              \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
-      const uint8_t *b, int b_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {    \
-    uint16_t fdata3[(H + 1) * W];                                        \
-    uint8_t temp2[H * W];                                                \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                          \
-                                                                         \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                         \
-        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                        \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);        \
-                                                                         \
-    aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,      \
-                                jcp_param);                              \
-                                                                         \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);            \
-  }
-
-JNT_SUBPIX_AVG_VAR(128, 128)
-JNT_SUBPIX_AVG_VAR(128, 64)
-JNT_SUBPIX_AVG_VAR(64, 128)
-JNT_SUBPIX_AVG_VAR(64, 64)
-JNT_SUBPIX_AVG_VAR(64, 32)
-JNT_SUBPIX_AVG_VAR(32, 64)
-JNT_SUBPIX_AVG_VAR(32, 32)
-JNT_SUBPIX_AVG_VAR(32, 16)
-JNT_SUBPIX_AVG_VAR(16, 32)
-JNT_SUBPIX_AVG_VAR(16, 16)
-JNT_SUBPIX_AVG_VAR(16, 8)
-JNT_SUBPIX_AVG_VAR(8, 16)
-JNT_SUBPIX_AVG_VAR(8, 8)
-JNT_SUBPIX_AVG_VAR(8, 4)
-JNT_SUBPIX_AVG_VAR(4, 8)
-JNT_SUBPIX_AVG_VAR(4, 4)
-JNT_SUBPIX_AVG_VAR(4, 16)
-JNT_SUBPIX_AVG_VAR(16, 4)
-JNT_SUBPIX_AVG_VAR(8, 32)
-JNT_SUBPIX_AVG_VAR(32, 8)
-JNT_SUBPIX_AVG_VAR(16, 64)
-JNT_SUBPIX_AVG_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
deleted file mode 100644
index 9d88b5e49..000000000
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ /dev/null
@@ -1,2385 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-static INLINE __m128i abs_diff(__m128i a, __m128i b) {
-  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
-}
-
-static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
-                                             __m128i *x2, __m128i *x3,
-                                             __m128i *d0, __m128i *d1,
-                                             __m128i *d2, __m128i *d3) {
-  // input
-  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
-  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
-  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
-  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
-  // output
-  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  *d0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-
-  *d1 = _mm_srli_si128(*d0,
-                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d2 = _mm_srli_si128(*d0,
-                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d3 = _mm_srli_si128(*d0,
-                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                         __m128i *x3, __m128i *d0, __m128i *d1,
-                                         __m128i *d2, __m128i *d3, __m128i *d4,
-                                         __m128i *d5, __m128i *d6,
-                                         __m128i *d7) {
-  // input
-  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
-  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
-  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
-  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
-  // output
-  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1, ww0, ww1;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-
-  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d1 = _mm_srli_si128(ww0,
-                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d2 = _mm_srli_si128(ww0,
-                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d3 = _mm_srli_si128(ww0,
-                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d5 = _mm_srli_si128(ww1,
-                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d6 = _mm_srli_si128(ww1,
-                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d7 = _mm_srli_si128(ww1,
-                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                         __m128i *x3, __m128i *x4, __m128i *x5,
-                                         __m128i *x6, __m128i *x7, __m128i *d0,
-                                         __m128i *d1, __m128i *d2,
-                                         __m128i *d3) {
-  // input
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5  50 51 52 53 54 55 56 57
-  // x6  60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  // output
-  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
-  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
-  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
-  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1, w2, w3, w4, w5;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  *d0 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  *d1 = _mm_srli_si128(*d0, 8);
-  *d2 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-  *d3 = _mm_srli_si128(*d2, 8);
-}
-
-static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                     __m128i *x3, __m128i *x4, __m128i *x5,
-                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
-                                     __m128i *d2d3, __m128i *d4d5,
-                                     __m128i *d6d7) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  // x4 40 41 42 43 44 45 46 47
-  // x5  50 51 52 53 54 55 56 57
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
-  // x6  60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  *d0d1 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  *d2d3 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  w6 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-  w7 = _mm_unpackhi_epi16(
-      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-
-  *d4d5 = _mm_unpacklo_epi32(
-      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-  *d6d7 = _mm_unpackhi_epi32(
-      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-}
-
-static INLINE void transpose16x8_8x16_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
-    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
-    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
-    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(*x0, *x1);
-  w1 = _mm_unpacklo_epi8(*x2, *x3);
-  w2 = _mm_unpacklo_epi8(*x4, *x5);
-  w3 = _mm_unpacklo_epi8(*x6, *x7);
-
-  w8 = _mm_unpacklo_epi8(*x8, *x9);
-  w9 = _mm_unpacklo_epi8(*x10, *x11);
-  w10 = _mm_unpacklo_epi8(*x12, *x13);
-  w11 = _mm_unpacklo_epi8(*x14, *x15);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  *d0 = _mm_unpacklo_epi64(w6, w14);
-  *d1 = _mm_unpackhi_epi64(w6, w14);
-  *d2 = _mm_unpacklo_epi64(w7, w15);
-  *d3 = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  *d4 = _mm_unpacklo_epi64(w6, w14);
-  *d5 = _mm_unpackhi_epi64(w6, w14);
-  *d6 = _mm_unpacklo_epi64(w7, w15);
-  *d7 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them  independently while flipping the second matrix horizontaly  Used for 14
-// taps filter pq pairs inverse
-static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
-                                            __m128i *x2, __m128i *x3,
-                                            __m128i *x4, __m128i *x5,
-                                            __m128i *x6, __m128i *x7,
-                                            __m128i *pq0, __m128i *pq1,
-                                            __m128i *pq2, __m128i *pq3) {
-  __m128i w10, w11, w12, w13;
-  __m128i w0, w1, w2, w3, w4, w5;
-  __m128i d0, d1, d2, d3;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  d0 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  d2 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  w10 = _mm_unpacklo_epi8(
-      *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
-  w11 = _mm_unpacklo_epi8(
-      *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
-  w12 = _mm_unpacklo_epi8(
-      *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
-  w13 = _mm_unpacklo_epi8(
-      *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
-
-  w4 = _mm_unpackhi_epi16(
-      w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpackhi_epi16(
-      w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  d1 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  d3 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
-  *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
-  *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
-  *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
-}
-
-static INLINE void transpose8x16_16x8_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
-    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
-    __m128i *d12d13, __m128i *d14d15) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(*x0, *x1);
-  w1 = _mm_unpacklo_epi8(*x2, *x3);
-  w2 = _mm_unpacklo_epi8(*x4, *x5);
-  w3 = _mm_unpacklo_epi8(*x6, *x7);
-
-  w8 = _mm_unpackhi_epi8(*x0, *x1);
-  w9 = _mm_unpackhi_epi8(*x2, *x3);
-  w10 = _mm_unpackhi_epi8(*x4, *x5);
-  w11 = _mm_unpackhi_epi8(*x6, *x7);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  *d0d1 = _mm_unpacklo_epi64(w6, w14);
-  *d2d3 = _mm_unpackhi_epi64(w6, w14);
-  *d4d5 = _mm_unpacklo_epi64(w7, w15);
-  *d6d7 = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  *d8d9 = _mm_unpacklo_epi64(w6, w14);
-  *d10d11 = _mm_unpackhi_epi64(w6, w14);
-  *d12d13 = _mm_unpacklo_epi64(w7, w15);
-  *d14d15 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them to 4x8  independently while flipping the second matrix horizontaly. Used
-// for 14 taps pq pairs creation
-static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                        __m128i *x3, __m128i *q0p0,
-                                        __m128i *q1p1, __m128i *q2p2,
-                                        __m128i *q3p3, __m128i *q4p4,
-                                        __m128i *q5p5, __m128i *q6p6,
-                                        __m128i *q7p7) {
-  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpackhi_epi8(
-      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
-  w3 = _mm_unpackhi_epi8(
-      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
-
-  ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
-  ww2 = _mm_unpacklo_epi16(
-      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
-  ww3 = _mm_unpackhi_epi16(
-      w2,
-      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
-
-  *q7p7 = _mm_unpacklo_epi32(
-      ww0,
-      _mm_srli_si128(
-          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
-  *q6p6 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww0, 4),
-      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
-  *q5p5 = _mm_unpackhi_epi32(
-      ww0,
-      _mm_slli_si128(
-          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
-  *q4p4 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww0, 12),
-      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
-  *q3p3 = _mm_unpacklo_epi32(
-      ww1,
-      _mm_srli_si128(
-          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
-  *q2p2 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww1, 4),
-      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
-  *q1p1 = _mm_unpackhi_epi32(
-      ww1,
-      _mm_slli_si128(
-          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
-  *q0p0 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww1, 12),
-      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
-}
-
-static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
-                                          __m128i *hev, __m128i *mask,
-                                          __m128i *qs1qs0, __m128i *ps1ps0) {
-  __m128i filter, filter2filter1, work;
-  __m128i ps1ps0_work, qs1qs0_work;
-  __m128i hev1;
-  const __m128i t3t4 =
-      _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8(0x80);
-  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
-  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
-  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
-  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
-  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
-  filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
-  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
-  filter = _mm_and_si128(filter, *mask); /* & mask */
-  filter = _mm_unpacklo_epi32(filter, filter);
-
-  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
-  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
-  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
-  filter2filter1 =
-      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // goto 16 bit
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
-  filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
-
-  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
-  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
-  filter = _mm_unpacklo_epi8(filter, filter);  // goto 16 bit
-  filter = _mm_srai_epi16(filter, 9);          /* round */
-  filter = _mm_packs_epi16(filter, filter);
-  filter = _mm_andnot_si128(*hev, filter);
-  filter = _mm_unpacklo_epi32(filter, filter);
-
-  filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
-  hev1 = _mm_srli_si128(filter2filter1, 8);
-  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
-  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
-  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
-  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-
-  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
-  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
-                                               __m128i *hev, __m128i *mask,
-                                               __m128i *qs1qs0,
-                                               __m128i *ps1ps0) {
-  const __m128i t3t4 =
-      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8(0x80);
-  __m128i filter, filter2filter1, work;
-  __m128i ps1ps0_work, qs1qs0_work;
-  __m128i hev1;
-  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
-  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
-  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
-  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
-  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
-  filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
-  filter = _mm_and_si128(filter, *mask); /* & mask */
-  filter = _mm_unpacklo_epi64(filter, filter);
-
-  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
-  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
-  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
-  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
-  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
-  filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
-  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
-
-  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
-  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
-  filter = _mm_unpacklo_epi8(filter, filter);
-  filter = _mm_srai_epi16(filter, 9); /* round */
-  filter = _mm_packs_epi16(filter, filter);
-  filter = _mm_andnot_si128(*hev, filter);
-
-  hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
-  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
-
-  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
-  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
-  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
-  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
-  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
-    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
-  __m128i q1p1, q0p0, p1p0, q1q0;
-  __m128i abs_p0q0, abs_p1q1;
-  __m128i mask, flat, hev;
-  const __m128i zero = _mm_setzero_si128();
-
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  /* (abs(q1 - q0), abs(p1 - p0) */
-  flat = abs_diff(q1p1, q0p0);
-  /* abs(p1 - q1), abs(p0 - q0) */
-  __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
-  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  hev = _mm_unpacklo_epi8(flat, zero);
-
-  hev = _mm_cmpgt_epi16(hev, *thresh);
-  hev = _mm_packs_epi16(hev, hev);
-  hev = _mm_unpacklo_epi32(hev, hev);
-
-  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
-  abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
-  abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
-  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
-  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-
-  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
-  mask = _mm_unpacklo_epi32(mask, flat);
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
-
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
-    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
-  __m128i q1p1, q0p0, p1p0, q1q0;
-  __m128i abs_p0q0, abs_p1q1;
-  __m128i mask, hev;
-  const __m128i zero = _mm_setzero_si128();
-
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  /* (abs(q1 - q0), abs(p1 - p0) */
-  __m128i flat = abs_diff(q1p1, q0p0);
-  /* abs(p1 - q1), abs(p0 - q0) */
-  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
-  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-  hev = _mm_unpacklo_epi8(flat, zero);
-
-  hev = _mm_cmpgt_epi16(hev, *thresh);
-  hev = _mm_packs_epi16(hev, hev);
-
-  /* const int8_t mask = filter_mask2(*limit, *blimit, */
-  /*                                  p1, p0, q0, q1); */
-  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
-  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
-  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
-  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
-  mask = _mm_unpacklo_epi64(mask, flat);
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
-
-  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
-                               const uint8_t *_blimit, const uint8_t *_limit,
-                               const uint8_t *_thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
-                                     _mm_loadl_epi64((const __m128i *)_limit));
-  __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
-  __m128i qs1qs0, ps1ps0;
-  __m128i p1, p0, q0, q1;
-
-  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
-  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
-  q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
-  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
-
-  xx_storel_32(s - 1 * p, ps1ps0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
-  xx_storel_32(s + 0 * p, qs1qs0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
-}
-
-void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
-                             const uint8_t *_blimit, const uint8_t *_limit,
-                             const uint8_t *_thresh) {
-  __m128i p1p0, q1q0;
-  __m128i p1, p0, q0, q1;
-
-  const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
-                                     _mm_loadl_epi64((const __m128i *)_limit));
-  __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
-  __m128i x0, x1, x2, x3;
-  __m128i d0, d1, d2, d3;
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
-  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
-
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
-
-  // Transpose 8x4 to 4x8
-  p1 = _mm_srli_si128(p1p0, 4);
-  q1 = _mm_srli_si128(q1q0, 4);
-
-  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
-  xx_storel_32(s + 0 * p - 2, d0);
-  xx_storel_32(s + 1 * p - 2, d1);
-  xx_storel_32(s + 2 * p - 2, d2);
-  xx_storel_32(s + 3 * p - 2, d3);
-}
-
-static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
-  xx_storel_32(s - (num + 1) * p, x);
-  xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
-    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
-    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i mask, hev, flat, flat2;
-  __m128i qs0ps0, qs1ps1;
-  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
-  __m128i abs_p1p0;
-
-  p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
-  q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
-
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
-    __m128i fe, ff, work;
-    abs_p1p0 = abs_diff(*q1p1, *q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8(0xfe);
-    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
-  qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
-  qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
-  // loopfilter done
-
-  __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-  __m128i work;
-  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-    __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-    __m128i pixelFilter_p, pixelFilter_q;
-    __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-    __m128i sum_p6, sum_q6;
-    __m128i sum_p3, sum_q3, res_p, res_q;
-
-    p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
-    p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
-    p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
-    p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
-    p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
-    q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
-    q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
-    q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
-    q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
-    q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
-    q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
-    q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
-    pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
-    pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
-
-    pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-    pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-    pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-    pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-    pixelFilter_p =
-        _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-    pixetFilter_p2p1p0 = _mm_add_epi16(
-        four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixelFilter_p,
-                      _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
-                                    _mm_add_epi16(p1_16, q0_16))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixelFilter_p,
-                      _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
-                                    _mm_add_epi16(p0_16, q1_16))),
-        4);
-    flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-    flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-    sum_p6 = _mm_add_epi16(p6_16, p6_16);
-    sum_q6 = _mm_add_epi16(q6_16, q6_16);
-    sum_p3 = _mm_add_epi16(p3_16, p3_16);
-    sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
-        4);
-    flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-    flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-    flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-    // work with flat2
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-
-    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    flat = _mm_unpacklo_epi64(flat, flat);
-    *q2p2 = _mm_andnot_si128(flat, *q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
-          4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
-          4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
-          4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
-          4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      // wide flat
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-      flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
-      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-      flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-      *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
-
-      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-      flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-      *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
-
-      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-      flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-      *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
-
-      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-      flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-      *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
-
-      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-      flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-      *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
-
-      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-      flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-      *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
-    }
-  } else {
-    *q0p0 = qs0ps0;
-    *q1p1 = qs1ps1;
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
-    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
-    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i mask, hev, flat, flat2;
-  __m128i flat2_pq[6], flat_pq[3];
-  __m128i qs0ps0, qs1ps1;
-  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
-  __m128i abs_p1p0;
-
-  p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  __m128i fe, ff, work;
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
-    abs_p1p0 = abs_diff(*q1p1, *q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-    fe = _mm_set1_epi8(0xfe);
-    ff = _mm_cmpeq_epi8(fe, fe);
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi32(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_unpacklo_epi32(mask, zero);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
-  qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
-  qs1ps1 = _mm_srli_si128(qs0ps0, 8);
-  // loopfilter done
-
-  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  flat = _mm_unpacklo_epi32(flat, flat);
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-    __m128i pq_16[7];
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p6;
-    __m128i sum_p3;
-
-    pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
-    pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
-    pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
-    pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
-    pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
-    pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
-    pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
-    q0_16 = _mm_srli_si128(pq_16[0], 8);
-    q1_16 = _mm_srli_si128(pq_16[1], 8);
-    q2_16 = _mm_srli_si128(pq_16[2], 8);
-    q3_16 = _mm_srli_si128(pq_16[3], 8);
-    q4_16 = _mm_srli_si128(pq_16[4], 8);
-    q5_16 = _mm_srli_si128(pq_16[5], 8);
-
-    __m128i flat_p[3], flat_q[3];
-    __m128i flat2_p[6], flat2_q[6];
-
-    __m128i work0, work0_0, work0_1, sum_p_0;
-    __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
-    __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
-
-    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
-    __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
-    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
-
-    sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
-    sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
-    sum_p = _mm_sub_epi16(sum_p_0, q5_16);
-
-    work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
-    work0_1 = _mm_add_epi16(
-        sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
-
-    sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q2_16);
-
-    work0 = _mm_add_epi16(sum_p3, pq_16[1]);
-    flat_p[1] = _mm_add_epi16(sum_lp, work0);
-    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
-    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
-    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-    flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
-    flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q1_16);
-    sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
-
-    sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
-    work0 = _mm_add_epi16(sum_p3, pq_16[2]);
-
-    flat_p[2] = _mm_add_epi16(sum_lp, work0);
-    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-    flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    flat2 = _mm_unpacklo_epi32(flat2, flat2);
-
-    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
-    *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
-    *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
-
-    *q2p2 = _mm_andnot_si128(flat, *q2p2);
-    flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
-    *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
-
-    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
-      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
-      flat2_q[0] = _mm_add_epi16(
-          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
-
-      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
-      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
-      flat2_pq[0] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-      flat2_pq[1] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-      flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
-      flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
-
-      sum_p = _mm_sub_epi16(sum_p, q4_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
-      flat2_p[2] = _mm_add_epi16(sum_p, work0);
-      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[2] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-      flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q3_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
-      flat2_p[3] = _mm_add_epi16(sum_p, work0);
-      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[3] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-      flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q2_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
-      flat2_p[4] = _mm_add_epi16(sum_p, work0);
-      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[4] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-      flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q1_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
-      flat2_p[5] = _mm_add_epi16(sum_p, work0);
-      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[5] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-      flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
-
-      // wide flat
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-      flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
-      *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
-
-      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-      flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
-      *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
-
-      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-      flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
-      *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
-
-      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-      flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
-      *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
-
-      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-      flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
-      *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
-
-      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-      flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
-      *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
-    }
-  } else {
-    *q0p0 = qs0ps0;
-    *q1p1 = qs1ps1;
-  }
-}
-
-void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
-
-  q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
-
-  q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
-
-  q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
-
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
-
-  store_buffer_horz_8(q0p0, p, 0, s);
-  store_buffer_horz_8(q1p1, p, 1, s);
-  store_buffer_horz_8(q2p2, p, 2, s);
-  store_buffer_horz_8(q3p3, p, 3, s);
-  store_buffer_horz_8(q4p4, p, 4, s);
-  store_buffer_horz_8(q5p5, p, 5, s);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
-    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
-    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
-  __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
-  __m128i ps1ps0, qs1qs0;
-
-  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-
-  {
-    // filter_mask and hev_mask
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
-    abs_p0q0 = abs_diff(*p1p0, *q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-    // considering sse doesn't have unsigned elements comparison the idea is
-    // to find at least one case when X > limit, it means the corresponding
-    // mask bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = abs_diff(q2p2, q1p1);
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
-    // flat_mask
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-
-    // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
-                            _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
-                                 3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_shft1 = _mm_srli_epi16(workp_a, 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
-                            p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_add_epi16(q1_16, q2_16);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
-                            p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(q2_16, q2_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
-                                 3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
-    *q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
-    *p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
-    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
-    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
-  __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
-  __m128i ps1ps0, qs1qs0;
-
-  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
-  *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-  {
-    // filter_mask and hev_mask
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
-    abs_p0q0 = abs_diff(*p1p0, *q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-    // considering sse doesn't have unsigned elements comparison the idea is
-    // to find at least one case when X > limit, it means the corresponding
-    // mask bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi32(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_unpacklo_epi32(mask, zero);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = abs_diff(q2p2, q1p1);
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
-    // flat_mask
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi32(flat, flat);
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_c;
-    __m128i pq0x2_pq1, pq1_pq2;
-    pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
-    pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
-    pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
-    q0_16 = _mm_srli_si128(pq0_16, 8);
-    q2_16 = _mm_srli_si128(pq2_16, 8);
-
-    // op1
-    pq0x2_pq1 =
-        _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
-    pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
-                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
-    workp_b =
-        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
-    workp_b = _mm_srli_epi16(workp_b, 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
-                            pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_srli_si128(pq1_pq2, 8);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
-                            pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(q2_16, q2_16);
-    workp_b =
-        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
-    workp_a = _mm_srli_epi16(workp_a, 3);
-
-    flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
-    *q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
-    *p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
-  }
-}
-
-void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
-  __m128i p2, p1, p0, q0, q1, q2;
-  __m128i p1p0, q1q0;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
-  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
-  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
-  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
-  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
-
-  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
-
-  xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
-  xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-}
-
-void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i p2, p1, p0, q0, q1, q2;
-  __m128i p1p0, q1q0;
-
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
-  lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                           &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_8_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *blimit, __m128i *limit, __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
-      flat_p1p0, flat_q0q1;
-  __m128i q2p2, q1p1, q0p0;
-  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_pq, opq2, pq2;
-
-  q3p3 = _mm_unpacklo_epi32(*p3, *q3);
-  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  // filter_mask and hev_mask
-
-  // considering sse doesn't have unsigned elements comparison the idea is to
-  // find at least one case when X > limit, it means the corresponding  mask
-  // bit is set.
-  // to achieve that we find global max value of all inputs of abs(x-y) or
-  // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-  // otherwise - not
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
-  abs_p1p0 = abs_diff(q1p1, q0p0);
-  abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
-  abs_p0q0 = abs_diff(p1p0, q1q0);
-  abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu8(flat, *thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-  // replicate for the further "merged variables" usage
-  hev = _mm_unpacklo_epi32(hev, hev);
-
-  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-  mask = _mm_unpacklo_epi32(mask, zero);
-  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = _mm_max_epu8(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
-  mask = _mm_max_epu8(work, mask);
-  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
-  // flat_mask4
-  flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi32(flat, flat);
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-    p3_16 = _mm_unpacklo_epi8(*p3, zero);
-    q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
-    // op2
-    workp_a =
-        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft2 = _mm_add_epi16(workp_a, workp_b);
-
-    // op1
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
-    workp_c = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
-    workp_d = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-    flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
-    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-    workp_c = _mm_add_epi16(workp_a, workp_b);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
-    workp_d = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-    flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-
-    opq2 = _mm_packus_epi16(workp_c, workp_c);
-
-    work_pq = _mm_andnot_si128(flat, q2p2);
-    pq2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_pq, pq2);
-    *q2 = _mm_srli_si128(*p2, 4);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *blimit, __m128i *limit, __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
-      flat_p1p0, flat_q0q1;
-  __m128i q2p2, q1p1, q0p0;
-  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_pq, opq2, pq2;
-
-  q3p3 = _mm_unpacklo_epi64(*p3, *q3);
-  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  {
-    // filter_mask and hev_mask
-
-    // considering sse doesn't have unsigned elements comparison the idea is to
-    // find at least one case when X > limit, it means the corresponding  mask
-    // bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
-    // flat_mask4
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-    flat = _mm_max_epu8(abs_p1p0, flat);
-
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-    p3_16 = _mm_unpacklo_epi8(*p3, zero);
-    q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
-    // op2
-    workp_a =
-        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op1
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
-
-    work_pq = _mm_andnot_si128(flat, q2p2);
-    pq2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_pq, pq2);
-    *q2 = _mm_srli_si128(*p2, 8);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
-  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
-  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
-  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
-  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
-  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
-  q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
-
-  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                      &blimit, &limit, &thresh);
-
-  xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
-  xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-  xx_storel_32(s - 3 * p, p2);
-  xx_storel_32(s + 2 * p, q2);
-}
-
-void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit0,
-                                     const unsigned char *_limit0,
-                                     const unsigned char *_thresh0,
-                                     const unsigned char *_blimit1,
-                                     const unsigned char *_limit1,
-                                     const unsigned char *_thresh1) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                                     _mm_load_si128((const __m128i *)_limit1));
-  __m128i thresh =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-
-  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-
-  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
-
-  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
-
-  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                            &blimit, &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-  _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
-  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-  _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
-  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-  _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
-}
-
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-
-  lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                           &blimit, &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
-  __m128i p1, p0, q0, q1;
-  __m128i qs1qs0, ps1ps0;
-
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-
-  __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
-  __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
-  __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
-  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
-  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
-}
-
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i p0, q0, q1, p1;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i qs1qs0, ps1ps0;
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-
-  __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
-  __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
-  __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
-  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
-  x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
-
-  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
-                        &q1);
-
-  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
-  p1 = _mm_srli_si128(ps1ps0, 8);
-  q1 = _mm_srli_si128(qs1qs0, 8);
-
-  transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
-                        &d5, &d6, &d7);
-
-  xx_storel_32((s - 2 + 0 * p), d0);
-  xx_storel_32((s - 2 + 1 * p), d1);
-  xx_storel_32((s - 2 + 2 * p), d2);
-  xx_storel_32((s - 2 + 3 * p), d3);
-  xx_storel_32((s - 2 + 4 * p), d4);
-  xx_storel_32((s - 2 + 5 * p), d5);
-  xx_storel_32((s - 2 + 6 * p), d6);
-  xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
-                             const unsigned char *_blimit,
-                             const unsigned char *_limit,
-                             const unsigned char *_thresh) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x2, x1, x0, x3;
-  __m128i p0, q0;
-  __m128i p1p0, q1q0;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
-  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
-                        &d7);
-
-  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 4);
-  q0 = _mm_srli_si128(q1q0, 4);
-
-  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
-  xx_storel_32(s + 0 * p - 2, d0);
-  xx_storel_32(s + 1 * p - 2, d1);
-  xx_storel_32(s + 2 * p - 2, d2);
-  xx_storel_32(s + 3 * p - 2, d3);
-}
-
-void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i p0, q0;
-  __m128i p1p0, q1q0;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-
-  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
-
-  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  d1 = _mm_srli_si128(d0d1, 8);
-  d3 = _mm_srli_si128(d2d3, 8);
-  d5 = _mm_srli_si128(d4d5, 8);
-  d7 = _mm_srli_si128(d6d7, 8);
-
-  lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
-                           &blimit, &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
-                        &d6, &d7);
-
-  xx_storel_32((s - 2 + 0 * p), d0);
-  xx_storel_32((s - 2 + 1 * p), d1);
-  xx_storel_32((s - 2 + 2 * p), d2);
-  xx_storel_32((s - 2 + 3 * p), d3);
-  xx_storel_32((s - 2 + 4 * p), d4);
-  xx_storel_32((s - 2 + 5 * p), d5);
-  xx_storel_32((s - 2 + 6 * p), d6);
-  xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
-                             const unsigned char *_blimit,
-                             const unsigned char *_limit,
-                             const unsigned char *_thresh) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
-  __m128i p0, q0;
-  __m128i x2, x1, x0, x3;
-  __m128i q1q0, p1p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
-  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
-                        &d7);
-  // Loop filtering
-  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
-                      &blimit, &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 4);
-  q0 = _mm_srli_si128(q1q0, 4);
-
-  transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
-                        &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d1, d3, d5, d7;
-  __m128i q1q0, p1p0;
-  __m128i p1, q1;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
-
-  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  d1 = _mm_srli_si128(d0d1, 8);
-  d3 = _mm_srli_si128(d2d3, 8);
-  d5 = _mm_srli_si128(d4d5, 8);
-  d7 = _mm_srli_si128(d6d7, 8);
-
-  lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
-                           &q1q0, &p1p0, &blimit, &limit, &thresh);
-
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
-
-  transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
-                    &d2d3, &d4d5, &d6d7);
-
-  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
-  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
-  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
-  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
-  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
-}
-
-void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
-                              const unsigned char *_blimit,
-                              const unsigned char *_limit,
-                              const unsigned char *_thresh) {
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x6, x5, x4, x3;
-  __m128i pq0, pq1, pq2, pq3;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-
-  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
-                       &q5p5, &q6p6, &q7p7);
-
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
-
-  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
-                           &q0p0, &pq0, &pq1, &pq2, &pq3);
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
-}
-
-void aom_lpf_vertical_14_dual_sse2(
-    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x7, x6, x5, x4, x3, x2, x1, x0;
-  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
-  __m128i q0, q1, q2, q3, q7;
-  __m128i p0p1, p2p3, p4p5, p6p7;
-
-  __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                                     _mm_load_si128((const __m128i *)_limit1));
-  __m128i thresh =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-
-  x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
-  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
-  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
-
-  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
-                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
-
-  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
-  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
-  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
-  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
-  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
-  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
-  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
-  q7 = _mm_srli_si128(d14d15, 8);
-
-  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                            &blimit, &limit, &thresh);
-
-  x0 = _mm_srli_si128(q0p0, 8);
-  x1 = _mm_srli_si128(q1p1, 8);
-  x2 = _mm_srli_si128(q2p2, 8);
-  x3 = _mm_srli_si128(q3p3, 8);
-  x4 = _mm_srli_si128(q4p4, 8);
-  x5 = _mm_srli_si128(q5p5, 8);
-  x6 = _mm_srli_si128(q6p6, 8);
-
-  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
-                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
-                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
-  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
-  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
-  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
-}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
deleted file mode 100644
index 8970fe7dd..000000000
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
-#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-
-static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
-                                            __m128i *x2, __m128i *x3,
-                                            __m128i *x4, __m128i *x5,
-                                            __m128i *d0, __m128i *d1,
-                                            __m128i *d2, __m128i *d3,
-                                            __m128i *d4, __m128i *d5) {
-  __m128i w0, w1, w2, w3, w4, w5, ww0;
-
-  // 00 01 02 03 04 05 xx xx
-  // 10 11 12 13 14 15 xx xx
-  // 20 21 22 23 24 25 xx xx
-  // 30 31 32 33 34 35 xx xx
-  // 40 41 42 43 44 45 xx xx
-  // 50 51 52 53 54 55 xx xx
-
-  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
-  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
-  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
-  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
-  *d1 = _mm_unpackhi_epi64(ww0,
-                           _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
-
-  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
-  *d2 = _mm_unpacklo_epi64(ww0,
-                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
-
-  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
-  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
-  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
-
-  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
-
-  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
-  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
-  *d5 = _mm_unpackhi_epi64(ww0,
-                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
-}
-
-static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
-                                                    __m128i *x2, __m128i *x3,
-                                                    __m128i *d0, __m128i *d1,
-                                                    __m128i *d2, __m128i *d3) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i w0, w1, ww0, ww1;
-
-  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
-  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
-  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
-
-  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
-  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
-  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
-  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
-}
-
-static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
-                                                     __m128i *x2, __m128i *x3,
-                                                     __m128i *d4, __m128i *d5,
-                                                     __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, ww2, ww3;
-  __m128i zero = _mm_setzero_si128();
-
-  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
-  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
-
-  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
-  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
-
-  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
-  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
-  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
-  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
-}
-
-// here in and out pointers (x and d) should be different! we don't store their
-// values inside
-static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
-                                                __m128i *x2, __m128i *x3,
-                                                __m128i *d0, __m128i *d1,
-                                                __m128i *d2, __m128i *d3,
-                                                __m128i *d4, __m128i *d5,
-                                                __m128i *d6, __m128i *d7) {
-  // input
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // output
-  // 00 10 20 30 xx xx xx xx
-  // 01 11 21 31 xx xx xx xx
-  // 02 12 22 32 xx xx xx xx
-  // 03 13 23 33 xx xx xx xx
-  // 04 14 24 34 xx xx xx xx
-  // 05 15 25 35 xx xx xx xx
-  // 06 16 26 36 xx xx xx xx
-  // 07 17 27 37 xx xx xx xx
-  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
-  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
-}
-
-static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
-                                                __m128i *x2, __m128i *x3,
-                                                __m128i *x4, __m128i *x5,
-                                                __m128i *x6, __m128i *x7,
-                                                __m128i *d0, __m128i *d1,
-                                                __m128i *d2, __m128i *d3) {
-  __m128i w0, w1, w2, w3, ww0, ww1;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5 50 51 52 53 54 55 56 57
-  // x6 60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-
-  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
-  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
-  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
-  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
-  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
-
-  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
-  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
-
-  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
-
-  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
-  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
-}
-
-static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
-                                                 __m128i *x2, __m128i *x3,
-                                                 __m128i *x4, __m128i *x5,
-                                                 __m128i *x6, __m128i *x7,
-                                                 __m128i *d4, __m128i *d5,
-                                                 __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, w2, w3, ww0, ww1;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5 50 51 52 53 54 55 56 57
-  // x6 60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
-  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
-  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
-  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
-  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
-
-  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
-  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
-
-  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
-  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
-
-  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
-  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
-}
-
-// here in and out pointers (x and d) should be different! we don't store their
-// values inside
-static INLINE void highbd_transpose8x8_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
-    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
-    __m128i *d7) {
-  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
-  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
-}
-
-// here in and out pointers (x and d arrays) should be different! we don't store
-// their values inside
-static INLINE void highbd_transpose8x16_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
-    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
-    __m128i *d7) {
-  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
-                           d5, d6, d7);
-  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
-                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
-                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
-}
-
-#endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
deleted file mode 100644
index 584b5e7e3..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
-
-static INLINE unsigned int masked_sad32xh_avx2(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height) {
-  int x, y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_scale =
-      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 32) {
-      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
-      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
-      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
-      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
-      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
-
-      // Calculate 16 predicted pixels.
-      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-      // is 64 * 255, so we have plenty of space to add rounding constants.
-      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
-      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
-      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
-      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
-
-      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
-      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
-      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
-      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
-
-      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
-      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
-  res = _mm256_shuffle_epi32(res, 0xd8);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
-}
-
-static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
-  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
-  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
-  __m256i a = _mm256_castsi128_si256(a0);
-  return _mm256_inserti128_si256(a, a1, 1);
-}
-
-static INLINE unsigned int masked_sad16xh_avx2(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
-  int y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_scale =
-      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  for (y = 0; y < height; y += 2) {
-    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
-    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
-    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
-    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
-    const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
-
-    // Calculate 16 predicted pixels.
-    // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-    // is 64 * 255, so we have plenty of space to add rounding constants.
-    const __m256i data_l = _mm256_unpacklo_epi8(a, b);
-    const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
-    __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
-    pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
-
-    const __m256i data_r = _mm256_unpackhi_epi8(a, b);
-    const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
-    __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
-    pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
-
-    const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
-    res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
-
-    src_ptr += src_stride << 1;
-    a_ptr += a_stride << 1;
-    b_ptr += b_stride << 1;
-    m_ptr += m_stride << 1;
-  }
-  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
-  res = _mm256_shuffle_epi32(res, 0xd8);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int aom_masked_sad_avx2(
-    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
-    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
-    int invert_mask, int m, int n) {
-  unsigned int sad;
-  if (!invert_mask) {
-    switch (m) {
-      case 4:
-        sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
-                                      second_pred, m, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
-                                      second_pred, m, msk, msk_stride, n);
-        break;
-      case 16:
-        sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
-                                  m, msk, msk_stride, n);
-        break;
-      default:
-        sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
-                                  m, msk, msk_stride, m, n);
-        break;
-    }
-  } else {
-    switch (m) {
-      case 4:
-        sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
-                                      ref_stride, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
-                                      ref_stride, msk, msk_stride, n);
-        break;
-      case 16:
-        sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
-                                  ref_stride, msk, msk_stride, n);
-        break;
-      default:
-        sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
-                                  ref_stride, msk, msk_stride, m, n);
-        break;
-    }
-  }
-  return sad;
-}
-
-#define MASKSADMXN_AVX2(m, n)                                                 \
-  unsigned int aom_masked_sad##m##x##n##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
-                               msk, msk_stride, invert_mask, m, n);           \
-  }
-
-MASKSADMXN_AVX2(4, 4)
-MASKSADMXN_AVX2(4, 8)
-MASKSADMXN_AVX2(8, 4)
-MASKSADMXN_AVX2(8, 8)
-MASKSADMXN_AVX2(8, 16)
-MASKSADMXN_AVX2(16, 8)
-MASKSADMXN_AVX2(16, 16)
-MASKSADMXN_AVX2(16, 32)
-MASKSADMXN_AVX2(32, 16)
-MASKSADMXN_AVX2(32, 32)
-MASKSADMXN_AVX2(32, 64)
-MASKSADMXN_AVX2(64, 32)
-MASKSADMXN_AVX2(64, 64)
-MASKSADMXN_AVX2(64, 128)
-MASKSADMXN_AVX2(128, 64)
-MASKSADMXN_AVX2(128, 128)
-MASKSADMXN_AVX2(4, 16)
-MASKSADMXN_AVX2(16, 4)
-MASKSADMXN_AVX2(8, 32)
-MASKSADMXN_AVX2(32, 8)
-MASKSADMXN_AVX2(16, 64)
-MASKSADMXN_AVX2(64, 16)
-
-static INLINE unsigned int highbd_masked_sad8xh_avx2(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  for (y = 0; y < height; y += 2) {
-    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
-    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
-    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
-    // Zero-extend mask to 16 bits
-    const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(m_ptr)),
-        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
-    const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
-
-    const __m256i data_l = _mm256_unpacklo_epi16(a, b);
-    const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
-    __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
-    pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
-                               AOM_BLEND_A64_ROUND_BITS);
-
-    const __m256i data_r = _mm256_unpackhi_epi16(a, b);
-    const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
-    __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
-    pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
-                               AOM_BLEND_A64_ROUND_BITS);
-
-    // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
-    // so it is safe to do signed saturation here.
-    const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
-    // There is no 16-bit SAD instruction, so we have to synthesize
-    // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
-    // and accumulating them at the end
-    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
-    res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
-
-    src_ptr += src_stride << 1;
-    a_ptr += a_stride << 1;
-    b_ptr += b_stride << 1;
-    m_ptr += m_stride << 1;
-  }
-  // At this point, we have four 32-bit partial SADs stored in 'res'.
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int highbd_masked_sad16xh_avx2(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int x, y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 16) {
-      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
-      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
-      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
-      // Zero-extend mask to 16 bits
-      const __m256i m =
-          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
-      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
-
-      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
-      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
-      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
-      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
-                                 AOM_BLEND_A64_ROUND_BITS);
-
-      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
-      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
-      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
-      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
-                                 AOM_BLEND_A64_ROUND_BITS);
-
-      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
-      // so it is safe to do signed saturation here.
-      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
-      // There is no 16-bit SAD instruction, so we have to synthesize
-      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
-      // and accumulating them at the end
-      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
-      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have four 32-bit partial SADs stored in 'res'.
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int aom_highbd_masked_sad_avx2(
-    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
-    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
-    int invert_mask, int m, int n) {
-  unsigned int sad;
-  if (!invert_mask) {
-    switch (m) {
-      case 4:
-        sad =
-            aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
-                                           second_pred, m, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
-                                        second_pred, m, msk, msk_stride, n);
-        break;
-      default:
-        sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
-                                         second_pred, m, msk, msk_stride, m, n);
-        break;
-    }
-  } else {
-    switch (m) {
-      case 4:
-        sad =
-            aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
-                                           ref_stride, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
-                                        ref_stride, msk, msk_stride, n);
-        break;
-      default:
-        sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
-                                         ref_stride, msk, msk_stride, m, n);
-        break;
-    }
-  }
-  return sad;
-}
-
-#define HIGHBD_MASKSADMXN_AVX2(m, n)                                      \
-  unsigned int aom_highbd_masked_sad##m##x##n##_avx2(                     \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,           \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,    \
-      int msk_stride, int invert_mask) {                                  \
-    return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
-                                      second_pred8, msk, msk_stride,      \
-                                      invert_mask, m, n);                 \
-  }
-
-HIGHBD_MASKSADMXN_AVX2(4, 4);
-HIGHBD_MASKSADMXN_AVX2(4, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 16);
-HIGHBD_MASKSADMXN_AVX2(32, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 32);
-HIGHBD_MASKSADMXN_AVX2(64, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 128);
-HIGHBD_MASKSADMXN_AVX2(128, 64);
-HIGHBD_MASKSADMXN_AVX2(128, 128);
-HIGHBD_MASKSADMXN_AVX2(4, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 16);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
deleted file mode 100644
index 493f9bd8f..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdio.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
-
-// For width a multiple of 16
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *a_ptr, int a_stride,
-                                            const uint8_t *b_ptr, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int width, int height);
-
-#define MASKSADMXN_SSSE3(m, n)                                                \
-  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    if (!invert_mask)                                                         \
-      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred,  \
-                              m, msk, msk_stride, m, n);                      \
-    else                                                                      \
-      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
-                              ref_stride, msk, msk_stride, m, n);             \
-  }
-
-#define MASKSAD8XN_SSSE3(n)                                                   \
-  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    if (!invert_mask)                                                         \
-      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,        \
-                                     second_pred, 8, msk, msk_stride, n);     \
-    else                                                                      \
-      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,    \
-                                     ref_stride, msk, msk_stride, n);         \
-  }
-
-#define MASKSAD4XN_SSSE3(n)                                                   \
-  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    if (!invert_mask)                                                         \
-      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,        \
-                                     second_pred, 4, msk, msk_stride, n);     \
-    else                                                                      \
-      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,    \
-                                     ref_stride, msk, msk_stride, n);         \
-  }
-
-MASKSADMXN_SSSE3(128, 128)
-MASKSADMXN_SSSE3(128, 64)
-MASKSADMXN_SSSE3(64, 128)
-MASKSADMXN_SSSE3(64, 64)
-MASKSADMXN_SSSE3(64, 32)
-MASKSADMXN_SSSE3(32, 64)
-MASKSADMXN_SSSE3(32, 32)
-MASKSADMXN_SSSE3(32, 16)
-MASKSADMXN_SSSE3(16, 32)
-MASKSADMXN_SSSE3(16, 16)
-MASKSADMXN_SSSE3(16, 8)
-MASKSAD8XN_SSSE3(16)
-MASKSAD8XN_SSSE3(8)
-MASKSAD8XN_SSSE3(4)
-MASKSAD4XN_SSSE3(8)
-MASKSAD4XN_SSSE3(4)
-MASKSAD4XN_SSSE3(16)
-MASKSADMXN_SSSE3(16, 4)
-MASKSAD8XN_SSSE3(32)
-MASKSADMXN_SSSE3(32, 8)
-MASKSADMXN_SSSE3(16, 64)
-MASKSADMXN_SSSE3(64, 16)
-
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *a_ptr, int a_stride,
-                                            const uint8_t *b_ptr, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int width, int height) {
-  int x, y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 16) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
-      const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-      // Calculate 16 predicted pixels.
-      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-      // is 64 * 255, so we have plenty of space to add rounding constants.
-      const __m128i data_l = _mm_unpacklo_epi8(a, b);
-      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
-      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
-      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i data_r = _mm_unpackhi_epi8(a, b);
-      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
-      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
-      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
-      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
-  int32_t sad =
-      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
-}
-
-unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height) {
-  int y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
-  for (y = 0; y < height; y += 2) {
-    const __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
-    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
-    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
-    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
-    const __m128i m =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
-                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
-    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
-    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
-    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
-    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
-    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
-    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
-    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
-    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-
-    src_ptr += src_stride * 2;
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  int32_t sad =
-      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
-}
-
-unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height) {
-  int y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
-  for (y = 0; y < height; y += 2) {
-    // Load two rows at a time, this seems to be a bit faster
-    // than four rows at a time in this case.
-    const __m128i src = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
-        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
-    const __m128i a =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
-    const __m128i b =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
-    const __m128i m =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
-    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-    const __m128i data = _mm_unpacklo_epi8(a, b);
-    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
-    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
-    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
-    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-
-    src_ptr += src_stride * 2;
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  // At this point, the SAD is stored in lane 0 of 'res'
-  int32_t sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
-}
-
-// For width a multiple of 8
-static INLINE unsigned int highbd_masked_sad_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height);
-
-#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
-  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
-      int msk_stride, int invert_mask) {                                      \
-    if (!invert_mask)                                                         \
-      return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride,      \
-                                     second_pred8, m, msk, msk_stride, m, n); \
-    else                                                                      \
-      return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
-                                     ref_stride, msk, msk_stride, m, n);      \
-  }
-
-#define HIGHBD_MASKSAD4XN_SSSE3(n)                                             \
-  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                             \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,                \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,         \
-      int msk_stride, int invert_mask) {                                       \
-    if (!invert_mask)                                                          \
-      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8,            \
-                                            ref_stride, second_pred8, 4, msk,  \
-                                            msk_stride, n);                    \
-    else                                                                       \
-      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
-                                            ref8, ref_stride, msk, msk_stride, \
-                                            n);                                \
-  }
-
-HIGHBD_MASKSADMXN_SSSE3(128, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 128)
-HIGHBD_MASKSADMXN_SSSE3(64, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 64)
-HIGHBD_MASKSADMXN_SSSE3(32, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 32)
-HIGHBD_MASKSADMXN_SSSE3(16, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 16)
-HIGHBD_MASKSADMXN_SSSE3(8, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 4)
-HIGHBD_MASKSAD4XN_SSSE3(8)
-HIGHBD_MASKSAD4XN_SSSE3(4)
-HIGHBD_MASKSAD4XN_SSSE3(16)
-HIGHBD_MASKSADMXN_SSSE3(16, 4)
-HIGHBD_MASKSADMXN_SSSE3(8, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 8)
-HIGHBD_MASKSADMXN_SSSE3(16, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 16)
-
-static INLINE unsigned int highbd_masked_sad_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int x, y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 8) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      // Zero-extend mask to 16 bits
-      const __m128i m = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
-      const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-      const __m128i data_l = _mm_unpacklo_epi16(a, b);
-      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i data_r = _mm_unpackhi_epi16(a, b);
-      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
-      // so it is safe to do signed saturation here.
-      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
-      // There is no 16-bit SAD instruction, so we have to synthesize
-      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
-      // and accumulating them at the end
-      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
-      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have four 32-bit partial SADs stored in 'res'.
-  res = _mm_hadd_epi32(res, res);
-  res = _mm_hadd_epi32(res, res);
-  int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
-}
-
-unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
-                                            const uint8_t *a8, int a_stride,
-                                            const uint8_t *b8, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (y = 0; y < height; y += 2) {
-    const __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
-                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
-    const __m128i b =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
-                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
-    // Zero-extend mask to 16 bits
-    const __m128i m = _mm_unpacklo_epi8(
-        _mm_unpacklo_epi32(
-            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
-            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
-        _mm_setzero_si128());
-    const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-    const __m128i data_l = _mm_unpacklo_epi16(a, b);
-    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i data_r = _mm_unpackhi_epi16(a, b);
-    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
-    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
-    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
-
-    src_ptr += src_stride * 2;
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  res = _mm_hadd_epi32(res, res);
-  res = _mm_hadd_epi32(res, res);
-  int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
-}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
deleted file mode 100644
index cffbd9672..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
-#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
-
-unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height);
-
-unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height);
-
-unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
-                                            const uint8_t *a8, int a_stride,
-                                            const uint8_t *b8, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int height);
-
-#endif  // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
deleted file mode 100644
index d7dbefd7d..000000000
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ /dev/null
@@ -1,1064 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-
-// For width a multiple of 16
-static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
-                            int yoffset, uint8_t *dst, int w, int h);
-
-static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h);
-
-static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h);
-
-// For width a multiple of 16
-static void masked_variance(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *a_ptr, int a_stride,
-                            const uint8_t *b_ptr, int b_stride,
-                            const uint8_t *m_ptr, int m_stride, int width,
-                            int height, unsigned int *sse, int *sum_);
-
-static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_);
-
-static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_);
-
-#define MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(        \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,   \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
-      const uint8_t *msk, int msk_stride, int invert_mask,            \
-      unsigned int *sse) {                                            \
-    int sum;                                                          \
-    uint8_t temp[(H + 1) * W];                                        \
-                                                                      \
-    bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
-                                                                      \
-    if (!invert_mask)                                                 \
-      masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
-                      msk_stride, W, H, sse, &sum);                   \
-    else                                                              \
-      masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
-                      msk_stride, W, H, sse, &sum);                   \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));         \
-  }
-
-#define MASK_SUBPIX_VAR8XH_SSSE3(H)                                           \
-  unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    int sum;                                                                  \
-    uint8_t temp[(H + 1) * 8];                                                \
-                                                                              \
-    bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H);           \
-                                                                              \
-    if (!invert_mask)                                                         \
-      masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    else                                                                      \
-      masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H));                 \
-  }
-
-#define MASK_SUBPIX_VAR4XH_SSSE3(H)                                           \
-  unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    int sum;                                                                  \
-    uint8_t temp[(H + 1) * 4];                                                \
-                                                                              \
-    bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);           \
-                                                                              \
-    if (!invert_mask)                                                         \
-      masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    else                                                                      \
-      masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
-  }
-
-MASK_SUBPIX_VAR_SSSE3(128, 128)
-MASK_SUBPIX_VAR_SSSE3(128, 64)
-MASK_SUBPIX_VAR_SSSE3(64, 128)
-MASK_SUBPIX_VAR_SSSE3(64, 64)
-MASK_SUBPIX_VAR_SSSE3(64, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 64)
-MASK_SUBPIX_VAR_SSSE3(32, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 32)
-MASK_SUBPIX_VAR_SSSE3(16, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 8)
-MASK_SUBPIX_VAR8XH_SSSE3(16)
-MASK_SUBPIX_VAR8XH_SSSE3(8)
-MASK_SUBPIX_VAR8XH_SSSE3(4)
-MASK_SUBPIX_VAR4XH_SSSE3(8)
-MASK_SUBPIX_VAR4XH_SSSE3(4)
-MASK_SUBPIX_VAR4XH_SSSE3(16)
-MASK_SUBPIX_VAR_SSSE3(16, 4)
-MASK_SUBPIX_VAR8XH_SSSE3(32)
-MASK_SUBPIX_VAR_SSSE3(32, 8)
-MASK_SUBPIX_VAR_SSSE3(64, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 64)
-
-static INLINE __m128i filter_block(const __m128i a, const __m128i b,
-                                   const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi8(a, b);
-  v0 = _mm_maddubs_epi16(v0, filter);
-  v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpackhi_epi8(a, b);
-  v1 = _mm_maddubs_epi16(v1, filter);
-  v1 = xx_roundn_epu16(v1, FILTER_BITS);
-
-  return _mm_packus_epi16(v0, v1);
-}
-
-static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
-                            int yoffset, uint8_t *dst, int w, int h) {
-  int i, j;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 16) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        _mm_storeu_si128((__m128i *)&b[j], x);
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else if (xoffset == 4) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 16) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
-        __m128i z = _mm_alignr_epi8(y, x, 1);
-        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else {
-    uint8_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 16) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
-        const __m128i z = _mm_alignr_epi8(y, x, 1);
-        const __m128i res = filter_block(x, z, hfilter_vec);
-        _mm_storeu_si128((__m128i *)&b[j], res);
-      }
-
-      src += src_stride;
-      b += w;
-    }
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
-      }
-      dst += w;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        const __m128i res = filter_block(x, y, vfilter_vec);
-        _mm_storeu_si128((__m128i *)&dst[j], res);
-      }
-
-      dst += w;
-    }
-  }
-}
-
-static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0,
-                                         const __m128i a1, const __m128i b1,
-                                         const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi8(a0, b0);
-  v0 = _mm_maddubs_epi16(v0, filter);
-  v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpacklo_epi8(a1, b1);
-  v1 = _mm_maddubs_epi16(v1, filter);
-  v1 = xx_roundn_epu16(v1, FILTER_BITS);
-
-  return _mm_packus_epi16(v0, v1);
-}
-
-static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h) {
-  int i;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)src);
-      _mm_storel_epi64((__m128i *)b, x);
-      src += src_stride;
-      b += 8;
-    }
-  } else if (xoffset == 4) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadu_si128((__m128i *)src);
-      __m128i z = _mm_srli_si128(x, 1);
-      _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
-      src += src_stride;
-      b += 8;
-    }
-  } else {
-    uint8_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
-      const __m128i z0 = _mm_srli_si128(x0, 1);
-      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
-      const __m128i z1 = _mm_srli_si128(x1, 1);
-      const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
-      _mm_storeu_si128((__m128i *)b, res);
-
-      src += src_stride * 2;
-      b += 16;
-    }
-    // Handle i = h separately
-    const __m128i x0 = _mm_loadu_si128((__m128i *)src);
-    const __m128i z0 = _mm_srli_si128(x0, 1);
-
-    __m128i v0 = _mm_unpacklo_epi8(x0, z0);
-    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
-    v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-    _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
-      dst += 8;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
-      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
-      const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec);
-      _mm_storeu_si128((__m128i *)dst, res);
-
-      dst += 16;
-    }
-  }
-}
-
-static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h) {
-  int i;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = xx_loadl_32((__m128i *)src);
-      xx_storel_32((__m128i *)b, x);
-      src += src_stride;
-      b += 4;
-    }
-  } else if (xoffset == 4) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)src);
-      __m128i z = _mm_srli_si128(x, 1);
-      xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
-      src += src_stride;
-      b += 4;
-    }
-  } else {
-    uint8_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
-    for (i = 0; i < h; i += 4) {
-      const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
-      const __m128i z0 = _mm_srli_si128(x0, 1);
-      const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
-      const __m128i z1 = _mm_srli_si128(x1, 1);
-      const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
-      const __m128i z2 = _mm_srli_si128(x2, 1);
-      const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
-      const __m128i z3 = _mm_srli_si128(x3, 1);
-
-      const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
-      const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
-      const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
-      const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
-      const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec);
-      _mm_storeu_si128((__m128i *)b, res);
-
-      src += src_stride * 4;
-      b += 16;
-    }
-    // Handle i = h separately
-    const __m128i x = _mm_loadl_epi64((__m128i *)src);
-    const __m128i z = _mm_srli_si128(x, 1);
-
-    __m128i v0 = _mm_unpacklo_epi8(x, z);
-    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
-    v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-    xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      __m128i x = xx_loadl_32((__m128i *)dst);
-      __m128i y = xx_loadl_32((__m128i *)&dst[4]);
-      xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
-      dst += 4;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
-    for (i = 0; i < h; i += 4) {
-      const __m128i a = xx_loadl_32((__m128i *)dst);
-      const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
-      const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
-      const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
-      const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
-
-      const __m128i a0 = _mm_unpacklo_epi32(a, b);
-      const __m128i b0 = _mm_unpacklo_epi32(b, c);
-      const __m128i a1 = _mm_unpacklo_epi32(c, d);
-      const __m128i b1 = _mm_unpacklo_epi32(d, e);
-      const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec);
-      _mm_storeu_si128((__m128i *)dst, res);
-
-      dst += 16;
-    }
-  }
-}
-
-static INLINE void accumulate_block(const __m128i src, const __m128i a,
-                                    const __m128i b, const __m128i m,
-                                    __m128i *sum, __m128i *sum_sq) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-  // Calculate 16 predicted pixels.
-  // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-  // is 64 * 255, so we have plenty of space to add rounding constants.
-  const __m128i data_l = _mm_unpacklo_epi8(a, b);
-  const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
-  __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
-  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i data_r = _mm_unpackhi_epi8(a, b);
-  const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
-  __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
-  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i src_l = _mm_unpacklo_epi8(src, zero);
-  const __m128i src_r = _mm_unpackhi_epi8(src, zero);
-  const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
-  const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
-
-  // Update partial sums and partial sums of squares
-  *sum =
-      _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
-  *sum_sq =
-      _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
-                                           _mm_madd_epi16(diff_r, diff_r)));
-}
-
-static void masked_variance(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *a_ptr, int a_stride,
-                            const uint8_t *b_ptr, int b_stride,
-                            const uint8_t *m_ptr, int m_stride, int width,
-                            int height, unsigned int *sse, int *sum_) {
-  int x, y;
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 16) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
-      accumulate_block(src, a, b, m, &sum, &sum_sq);
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, sum);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_) {
-  int y;
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
-  for (y = 0; y < height; y += 2) {
-    __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
-    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
-    const __m128i m =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
-                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
-    accumulate_block(src, a, b, m, &sum, &sum_sq);
-
-    src_ptr += src_stride * 2;
-    a_ptr += 16;
-    b_ptr += 16;
-    m_ptr += m_stride * 2;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, sum);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_) {
-  int y;
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
-  for (y = 0; y < height; y += 4) {
-    // Load four rows at a time
-    __m128i src =
-        _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
-                       *(uint32_t *)&src_ptr[src_stride * 2],
-                       *(uint32_t *)&src_ptr[src_stride * 3]);
-    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
-    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
-    const __m128i m = _mm_setr_epi32(
-        *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
-        *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
-    accumulate_block(src, a, b, m, &sum, &sum_sq);
-
-    src_ptr += src_stride * 4;
-    a_ptr += 16;
-    b_ptr += 16;
-    m_ptr += m_stride * 4;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, sum);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-// For width a multiple of 8
-static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
-                                   int xoffset, int yoffset, uint16_t *dst,
-                                   int w, int h);
-
-static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
-                                      int xoffset, int yoffset, uint16_t *dst,
-                                      int h);
-
-// For width a multiple of 8
-static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
-                                   const uint16_t *a_ptr, int a_stride,
-                                   const uint16_t *b_ptr, int b_stride,
-                                   const uint8_t *m_ptr, int m_stride,
-                                   int width, int height, uint64_t *sse,
-                                   int *sum_);
-
-static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
-                                      const uint16_t *a_ptr,
-                                      const uint16_t *b_ptr,
-                                      const uint8_t *m_ptr, int m_stride,
-                                      int height, int *sse, int *sum_);
-
-#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H)                                  \
-  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3(     \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    uint64_t sse64;                                                         \
-    int sum;                                                                \
-    uint16_t temp[(H + 1) * W];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    else                                                                    \
-      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    *sse = (uint32_t)sse64;                                                 \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
-  }                                                                         \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(    \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    uint64_t sse64;                                                         \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * W];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    else                                                                    \
-      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                          \
-    sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }                                                                         \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(    \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    uint64_t sse64;                                                         \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * W];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    else                                                                    \
-      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                          \
-    sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }
-
-#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H)                                  \
-  unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3(         \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    int sse_;                                                               \
-    int sum;                                                                \
-    uint16_t temp[(H + 1) * 4];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    else                                                                    \
-      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    *sse = (uint32_t)sse_;                                                  \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
-  }                                                                         \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3(        \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    int sse_;                                                               \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * 4];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    else                                                                    \
-      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4);                           \
-    sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }                                                                         \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3(        \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    int sse_;                                                               \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * 4];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    else                                                                    \
-      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8);                           \
-    sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }
-
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
-
-static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
-                                          const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi16(a, b);
-  v0 = _mm_madd_epi16(v0, filter);
-  v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpackhi_epi16(a, b);
-  v1 = _mm_madd_epi16(v1, filter);
-  v1 = xx_roundn_epu32(v1, FILTER_BITS);
-
-  return _mm_packs_epi32(v0, v1);
-}
-
-static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
-                                   int xoffset, int yoffset, uint16_t *dst,
-                                   int w, int h) {
-  int i, j;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        _mm_storeu_si128((__m128i *)&b[j], x);
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else if (xoffset == 4) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
-        __m128i z = _mm_alignr_epi8(y, x, 2);
-        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else {
-    uint16_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
-        const __m128i z = _mm_alignr_epi8(y, x, 2);
-        const __m128i res = highbd_filter_block(x, z, hfilter_vec);
-        _mm_storeu_si128((__m128i *)&b[j], res);
-      }
-
-      src += src_stride;
-      b += w;
-    }
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
-      }
-      dst += w;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        const __m128i res = highbd_filter_block(x, y, vfilter_vec);
-        _mm_storeu_si128((__m128i *)&dst[j], res);
-      }
-
-      dst += w;
-    }
-  }
-}
-
-static INLINE __m128i highbd_filter_block_2rows(const __m128i a0,
-                                                const __m128i b0,
-                                                const __m128i a1,
-                                                const __m128i b1,
-                                                const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi16(a0, b0);
-  v0 = _mm_madd_epi16(v0, filter);
-  v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpacklo_epi16(a1, b1);
-  v1 = _mm_madd_epi16(v1, filter);
-  v1 = xx_roundn_epu32(v1, FILTER_BITS);
-
-  return _mm_packs_epi32(v0, v1);
-}
-
-static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
-                                      int xoffset, int yoffset, uint16_t *dst,
-                                      int h) {
-  int i;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)src);
-      _mm_storel_epi64((__m128i *)b, x);
-      src += src_stride;
-      b += 4;
-    }
-  } else if (xoffset == 4) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadu_si128((__m128i *)src);
-      __m128i z = _mm_srli_si128(x, 2);
-      _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
-      src += src_stride;
-      b += 4;
-    }
-  } else {
-    uint16_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
-      const __m128i z0 = _mm_srli_si128(x0, 2);
-      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
-      const __m128i z1 = _mm_srli_si128(x1, 2);
-      const __m128i res =
-          highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
-      _mm_storeu_si128((__m128i *)b, res);
-
-      src += src_stride * 2;
-      b += 8;
-    }
-    // Process i = h separately
-    __m128i x = _mm_loadu_si128((__m128i *)src);
-    __m128i z = _mm_srli_si128(x, 2);
-
-    __m128i v0 = _mm_unpacklo_epi16(x, z);
-    v0 = _mm_madd_epi16(v0, hfilter_vec);
-    v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
-    _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
-      dst += 4;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
-      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
-      const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec);
-      _mm_storeu_si128((__m128i *)dst, res);
-
-      dst += 8;
-    }
-  }
-}
-
-static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
-                                   const uint16_t *a_ptr, int a_stride,
-                                   const uint16_t *b_ptr, int b_stride,
-                                   const uint8_t *m_ptr, int m_stride,
-                                   int width, int height, uint64_t *sse,
-                                   int *sum_) {
-  int x, y;
-  // Note on bit widths:
-  // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
-  // so this can be kept as four 32-bit values.
-  // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
-  // so this must be stored as two 64-bit values.
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i zero = _mm_setzero_si128();
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 8) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      const __m128i m =
-          _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
-      const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-      // Calculate 8 predicted pixels.
-      const __m128i data_l = _mm_unpacklo_epi16(a, b);
-      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i data_r = _mm_unpackhi_epi16(a, b);
-      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i src_l = _mm_unpacklo_epi16(src, zero);
-      const __m128i src_r = _mm_unpackhi_epi16(src, zero);
-      __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
-      __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
-
-      // Update partial sums and partial sums of squares
-      sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
-      // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
-      // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
-      // So we can re-pack into 16-bit fields and use _mm_madd_epi16
-      // to calculate the squares and partially sum them.
-      const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
-      const __m128i prod = _mm_madd_epi16(tmp, tmp);
-      // Then we want to sign-extend to 64 bits and accumulate
-      const __m128i sign = _mm_srai_epi32(prod, 31);
-      const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
-      const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
-      sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, zero);
-  sum = _mm_hadd_epi32(sum, zero);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
-  _mm_storel_epi64((__m128i *)sse, sum_sq);
-}
-
-static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
-                                      const uint16_t *a_ptr,
-                                      const uint16_t *b_ptr,
-                                      const uint8_t *m_ptr, int m_stride,
-                                      int height, int *sse, int *sum_) {
-  int y;
-  // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions).
-  // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
-  // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
-  // So we can safely pack sum_sq into 32-bit fields, which is slightly more
-  // convenient.
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i zero = _mm_setzero_si128();
-
-  for (y = 0; y < height; y += 2) {
-    __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
-    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
-    const __m128i m = _mm_unpacklo_epi8(
-        _mm_unpacklo_epi32(
-            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
-            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
-        zero);
-    const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-    const __m128i data_l = _mm_unpacklo_epi16(a, b);
-    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i data_r = _mm_unpackhi_epi16(a, b);
-    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i src_l = _mm_unpacklo_epi16(src, zero);
-    const __m128i src_r = _mm_unpackhi_epi16(src, zero);
-    __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
-    __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
-
-    // Update partial sums and partial sums of squares
-    sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
-    const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
-    const __m128i prod = _mm_madd_epi16(tmp, tmp);
-    sum_sq = _mm_add_epi32(sum_sq, prod);
-
-    src_ptr += src_stride * 2;
-    a_ptr += 8;
-    b_ptr += 8;
-    m_ptr += m_stride * 2;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, zero);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
-                              int width, int height, const uint8_t *ref,
-                              int ref_stride, const uint8_t *mask,
-                              int mask_stride, int invert_mask) {
-  const uint8_t *src0 = invert_mask ? pred : ref;
-  const uint8_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  assert(height % 2 == 0);
-  int i = 0;
-  if (width == 8) {
-    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
-                           mask, mask_stride);
-  } else if (width == 16) {
-    do {
-      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
-      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
-                              mask + mask_stride, comp_pred + width);
-      comp_pred += (width << 1);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      i += 2;
-    } while (i < height);
-  } else {  // width == 32
-    assert(width == 32);
-    do {
-      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
-      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
-      comp_pred += (width);
-      src0 += (stride0);
-      src1 += (stride1);
-      mask += (mask_stride);
-      i += 1;
-    } while (i < height);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
deleted file mode 100644
index 4faa098ac..000000000
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
-#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
-
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-
-static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
-                                           const uint8_t *src1,
-                                           const uint8_t *mask, uint8_t *dst) {
-  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i round_offset =
-      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
-  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
-  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
-
-  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
-
-  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
-  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
-  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
-  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
-
-  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
-  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
-
-  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
-  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
-  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
-}
-
-static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
-                                          const uint8_t *src0, int stride0,
-                                          const uint8_t *src1, int stride1,
-                                          const uint8_t *mask,
-                                          int mask_stride) {
-  int i = 0;
-  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i round_offset =
-      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    // odd line A
-    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
-    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
-    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
-    // even line B
-    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
-    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
-    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
-        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
-
-    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
-    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
-
-    const __m128i ma = _mm_sub_epi8(alpha_max, a);
-    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
-    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
-
-    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
-    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
-    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
-    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
-    const __m128i round = _mm_packus_epi16(roundA, roundB);
-    // comp_pred's stride == width == 8
-    _mm_store_si128((__m128i *)(comp_pred), round);
-    comp_pred += (8 << 1);
-    src0 += (stride0 << 1);
-    src1 += (stride1 << 1);
-    mask += (mask_stride << 1);
-    i += 2;
-  } while (i < height);
-}
-
-#endif  // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
deleted file mode 100644
index 6c821673e..000000000
--- a/third_party/aom/aom_dsp/x86/mem_sse2.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
-#define AOM_AOM_DSP_X86_MEM_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
-  return _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
-}
-
-static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
-                                                  const int byte_stride) {
-  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
-}
-
-static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
-                                                  const int byte_stride) {
-  __m128i dst;
-  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
-  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
-  return dst;
-}
-
-#endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
deleted file mode 100644
index 5181e444c..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
-#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
-
-#include <smmintrin.h>
-
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    unsigned int *const sse, int *const sum,
-                                    const int h) {
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
-    const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
-    const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
-
-    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
-    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
deleted file mode 100644
index 48486c6c4..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  return _mm_cvtsi128_si32(v_d);
-}
-
-static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
-  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
-#if ARCH_X86_64
-  return _mm_cvtsi128_si64(v_q);
-#else
-  {
-    int64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_q);
-    return tmp;
-  }
-#endif
-}
-
-static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
-  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
-  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
-  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
-  return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
-}
-
-// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
-  const __m128i v_tmp_d =
-      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
-  return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
deleted file mode 100644
index 2aa2a0555..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
-                                            const int pre_stride,
-                                            const int32_t *wsrc,
-                                            const int32_t *mask,
-                                            const int height) {
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
-  do {
-    const __m128i v_p_b_0 = xx_loadl_32(pre);
-    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
-    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
-    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
-    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
-    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
-    n += 8;
-    pre += pre_stride << 1;
-  } while (n < 8 * (height >> 1));
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int obmc_sad_w8n_avx2(
-    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p0_b = xx_loadl_64(pre + n);
-    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
-    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
-    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
-    n += 8;
-
-    if ((n & (width - 1)) == 0) pre += pre_step;
-  } while (n < width * height);
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define OBMCSADWXH(w, h)                                          \
-  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *msk) {                                       \
-    if (w == 4) {                                                 \
-      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
-    } else {                                                      \
-      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
-    }                                                             \
-  }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
-                                                const int pre_stride,
-                                                const int32_t *wsrc,
-                                                const int32_t *mask,
-                                                const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  do {
-    const __m128i v_p_w_0 = xx_loadl_64(pre);
-    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
-    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
-    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
-    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
-    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
-    n += 8;
-
-    pre += pre_stride << 1;
-  } while (n < 8 * (height >> 1));
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
-    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
-    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
-    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define HBD_OBMCSADWXH(w, h)                                           \
-  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
-      const int32_t *mask) {                                           \
-    if (w == 4) {                                                      \
-      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
-    } else {                                                           \
-      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
-    }                                                                  \
-  }
-
-HBD_OBMCSADWXH(128, 128)
-HBD_OBMCSADWXH(128, 64)
-HBD_OBMCSADWXH(64, 128)
-HBD_OBMCSADWXH(64, 64)
-HBD_OBMCSADWXH(64, 32)
-HBD_OBMCSADWXH(32, 64)
-HBD_OBMCSADWXH(32, 32)
-HBD_OBMCSADWXH(32, 16)
-HBD_OBMCSADWXH(16, 32)
-HBD_OBMCSADWXH(16, 16)
-HBD_OBMCSADWXH(16, 8)
-HBD_OBMCSADWXH(8, 16)
-HBD_OBMCSADWXH(8, 8)
-HBD_OBMCSADWXH(8, 4)
-HBD_OBMCSADWXH(4, 8)
-HBD_OBMCSADWXH(4, 4)
-HBD_OBMCSADWXH(4, 16)
-HBD_OBMCSADWXH(16, 4)
-HBD_OBMCSADWXH(8, 32)
-HBD_OBMCSADWXH(32, 8)
-HBD_OBMCSADWXH(16, 64)
-HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
deleted file mode 100644
index 0338a8c77..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
-                                                 const int pre_stride,
-                                                 const int32_t *wsrc,
-                                                 const int32_t *mask,
-                                                 const int height) {
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  do {
-    const __m128i v_p_b = xx_loadl_32(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
-    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_b = xx_loadl_32(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
-    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
-    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
-    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define OBMCSADWXH(w, h)                                       \
-  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
-      const int32_t *msk) {                                    \
-    if (w == 4) {                                              \
-      return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
-    } else {                                                   \
-      return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
-    }                                                          \
-  }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
-                                                     const int pre_stride,
-                                                     const int32_t *wsrc,
-                                                     const int32_t *mask,
-                                                     const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  do {
-    const __m128i v_p_w = xx_loadl_64(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_w = xx_loadl_64(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
-    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
-    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
-    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define HBD_OBMCSADWXH(w, h)                                      \
-  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *mask) {                                      \
-    if (w == 4) {                                                 \
-      return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
-    } else {                                                      \
-      return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
-    }                                                             \
-  }
-
-HBD_OBMCSADWXH(128, 128)
-HBD_OBMCSADWXH(128, 64)
-HBD_OBMCSADWXH(64, 128)
-HBD_OBMCSADWXH(64, 64)
-HBD_OBMCSADWXH(64, 32)
-HBD_OBMCSADWXH(32, 64)
-HBD_OBMCSADWXH(32, 32)
-HBD_OBMCSADWXH(32, 16)
-HBD_OBMCSADWXH(16, 32)
-HBD_OBMCSADWXH(16, 16)
-HBD_OBMCSADWXH(16, 8)
-HBD_OBMCSADWXH(8, 16)
-HBD_OBMCSADWXH(8, 8)
-HBD_OBMCSADWXH(8, 4)
-HBD_OBMCSADWXH(4, 8)
-HBD_OBMCSADWXH(4, 4)
-HBD_OBMCSADWXH(4, 16)
-HBD_OBMCSADWXH(16, 4)
-HBD_OBMCSADWXH(8, 32)
-HBD_OBMCSADWXH(32, 8)
-HBD_OBMCSADWXH(16, 64)
-HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
deleted file mode 100644
index bfec0e8a8..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
-                                     const int32_t *wsrc, const int32_t *mask,
-                                     unsigned int *const sse, int *const sum,
-                                     const int w, const int h) {
-  int n = 0, width, height = h;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  __m128i v_d;
-  const uint8_t *pre_temp;
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-  do {
-    width = w;
-    pre_temp = pre;
-    do {
-      const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
-      const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
-      const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
-      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-
-      // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-      // boundaries. We use pmaddwd, as it has lower latency on Haswell
-      // than pmulld but produces the same result with these inputs.
-      const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
-      const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-
-      const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
-      const __m256i v_tmp_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
-      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
-      const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
-      const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
-
-      const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
-      const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-      v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-      pre_temp += 8;
-      n += 8;
-      width -= 8;
-    } while (width > 0);
-    pre += pre_stride;
-    height -= 1;
-  } while (height > 0);
-  v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  *sum = _mm_cvtsi128_si32(v_d);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
-}
-
-static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
-                                      const int32_t *wsrc, const int32_t *mask,
-                                      unsigned int *const sse, int *const sum,
-                                      const int w, const int h) {
-  int n = 0, width, height = h;
-  __m256i v_d;
-  __m128i res0;
-  const uint8_t *pre_temp;
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-
-  assert(w >= 16);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-  do {
-    width = w;
-    pre_temp = pre;
-    do {
-      const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
-      const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
-      const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
-      const __m256i v_m1_d =
-          _mm256_loadu_si256((__m256i const *)(mask + n + 8));
-      const __m256i v_w1_d =
-          _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
-
-      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-      const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
-
-      const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-      const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
-
-      const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-      const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
-
-      const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
-      const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
-
-      const __m256i v_tmp0_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
-      const __m256i v_tmp1_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
-
-      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
-      const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
-
-      const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
-      const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
-      const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-      v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
-      v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
-
-      pre_temp += 16;
-      n += 16;
-      width -= 16;
-    } while (width > 0);
-    pre += pre_stride;
-    height -= 1;
-  } while (height > 0);
-
-  v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
-  v_d = _mm256_hadd_epi32(v_d, v_d);
-  res0 = _mm256_castsi256_si128(v_d);
-  res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
-  *sum = _mm_cvtsi128_si32(res0);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
-}
-
-#define OBMCVARWXH(W, H)                                                \
-  unsigned int aom_obmc_variance##W##x##H##_avx2(                       \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
-      const int32_t *mask, unsigned int *sse) {                         \
-    int sum;                                                            \
-    if (W == 4) {                                                       \
-      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);      \
-    } else if (W == 8) {                                                \
-      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);  \
-    } else {                                                            \
-      obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
-    }                                                                   \
-                                                                        \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));       \
-  }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
deleted file mode 100644
index 72eda0e57..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
-                                     const int32_t *wsrc, const int32_t *mask,
-                                     unsigned int *const sse, int *const sum,
-                                     const int w, const int h) {
-  const int pre_step = pre_stride - w;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_b = xx_loadl_32(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
-    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
-    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
-    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
-    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
-    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 8;
-
-    if (n % w == 0) pre += pre_step;
-  } while (n < w * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#define OBMCVARWXH(W, H)                                               \
-  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
-      const int32_t *mask, unsigned int *sse) {                        \
-    int sum;                                                           \
-    if (W == 4) {                                                      \
-      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
-    } else {                                                           \
-      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
-    }                                                                  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
-  }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
-
-#include "config/aom_dsp_rtcd.h"
-
-#define OBMC_SUBPIX_VAR(W, H)                                                \
-  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint8_t temp2[H * W];                                                    \
-                                                                             \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
-        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
-  }
-
-OBMC_SUBPIX_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 64)
-OBMC_SUBPIX_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 32)
-OBMC_SUBPIX_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 16)
-OBMC_SUBPIX_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 8)
-OBMC_SUBPIX_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 4)
-OBMC_SUBPIX_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_SUBPIX_VAR(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void hbd_obmc_variance_w4(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p_w = xx_loadl_64(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
-    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-static INLINE void hbd_obmc_variance_w8n(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
-    const int h) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - w;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_w = xx_loadl_64(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
-    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
-    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
-    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
-    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
-    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 8;
-
-    if (n % w == 0) pre += pre_step;
-  } while (n < w * h);
-
-  *sum += xx_hsum_epi32_si64(v_sum_d);
-  *sse += xx_hsum_epi32_si64(v_sse_d);
-}
-
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, int w, int h,
-                                        unsigned int *sse, int *sum) {
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-  if (w == 4) {
-    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
-    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
-  }
-  *sum = (int)sum64;
-  *sse = (unsigned int)sse64;
-}
-
-static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask, int w, int h,
-                                           unsigned int *sse, int *sum) {
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-  if (w == 4) {
-    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else if (w < 128 || h < 128) {
-    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
-  } else {
-    assert(w == 128 && h == 128);
-
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
-                            64);
-      pre8 += 64 * pre_stride;
-      wsrc += 64 * w;
-      mask += 64 * w;
-      h -= 64;
-    } while (h > 0);
-  }
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask, int w, int h,
-                                           unsigned int *sse, int *sum) {
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-  int max_pel_allowed_per_ovf = 512;
-  if (w == 4) {
-    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else if (w * h <= max_pel_allowed_per_ovf) {
-    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
-  } else {
-    int h_per_ovf = max_pel_allowed_per_ovf / w;
-
-    assert(max_pel_allowed_per_ovf % w == 0);
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
-                            h_per_ovf);
-      pre8 += h_per_ovf * pre_stride;
-      wsrc += h_per_ovf * w;
-      mask += h_per_ovf * w;
-      h -= h_per_ovf;
-    } while (h > 0);
-  }
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HBD_OBMCVARWXH(W, H)                                               \
-  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                 \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-HBD_OBMCVARWXH(128, 128)
-HBD_OBMCVARWXH(128, 64)
-HBD_OBMCVARWXH(64, 128)
-HBD_OBMCVARWXH(64, 64)
-HBD_OBMCVARWXH(64, 32)
-HBD_OBMCVARWXH(32, 64)
-HBD_OBMCVARWXH(32, 32)
-HBD_OBMCVARWXH(32, 16)
-HBD_OBMCVARWXH(16, 32)
-HBD_OBMCVARWXH(16, 16)
-HBD_OBMCVARWXH(16, 8)
-HBD_OBMCVARWXH(8, 16)
-HBD_OBMCVARWXH(8, 8)
-HBD_OBMCVARWXH(8, 4)
-HBD_OBMCVARWXH(4, 8)
-HBD_OBMCVARWXH(4, 4)
-HBD_OBMCVARWXH(4, 16)
-HBD_OBMCVARWXH(16, 4)
-HBD_OBMCVARWXH(8, 32)
-HBD_OBMCVARWXH(32, 8)
-HBD_OBMCVARWXH(16, 64)
-HBD_OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index 216a0bd8f..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,435 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  vzeroupper
-
-%ifnidn %1, b_32x32
-
-  ; Special case for ncoeff == 16, as it is frequent and we can save on
-  ; not setting up a loop.
-  cmp                       ncoeffmp, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Special case of ncoeff == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
-  movifnidn                   coeffq, coeffmp
-  movifnidn                    zbinq, zbinmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-
-  ; Get DC and first 15 AC coeffs - in this special case, that is all.
-  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
-  mova                            m9, [coeffq]
-  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
-  mova                           m10, [coeffq+32]
-  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-
-  mov                             r0, eobmp                ; Output pointer
-  mov                             r1, qcoeffmp             ; Output pointer
-  mov                             r2, dqcoeffmp            ; Output pointer
-
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  pcmpeqw                         m4, m4                   ; All word lanes -1
-  paddw                           m0, m4                   ; m0 = zbin - 1
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, we just write zeros
-  ; to the outputs and we are done.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .single_nonzero
-
-  mova                       [r1   ], ymm5
-  mova                       [r1+32], ymm5
-  mova                       [r2   ], ymm5
-  mova                       [r2+32], ymm5
-  mov                           [r0], word 0
-
-  vzeroupper
-  RET
-
-.single_nonzero:
-
-  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
-  movifnidn                       r3, roundmp
-  movifnidn                       r4, quantmp
-  mov                             r6, dequantmp
-  mov                             r5, shiftmp
-  mova                            m1, [r3]              ; m1 = round
-  mova                            m2, [r4]              ; m2 = quant
-  mova                            m3, [r6]              ; m3 = dequant
-  mova                            m4, [r5]              ; m4 = shift
-
-  mov                             r3, iscanmp
-
-  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                  [qcoeffq   ], m11
-  mova                  [qcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                  [qcoeffq+32], m11
-  mova                  [qcoeffq+48], m6
-
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                 [dqcoeffq   ], m11
-  mova                 [dqcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                 [dqcoeffq+32], m11
-  mova                 [dqcoeffq+48], m6
-
-  mova                            m6, [iscanq]            ; m6 = scan[i]
-  mova                           m11, [iscanq+16]         ; m11 = scan[i]
-
-  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
-  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
-  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
-  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
-  pandn                           m8,  m8,  m6            ; m8 = max(eob)
-  pandn                          m13, m13, m11            ; m13 = max(eob)
-  pmaxsw                          m8,  m8, m13
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                         [eobq], ax
-
-  vzeroupper
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of ncoeff != 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  ; Actual quantization loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movifnidn                 dequantq, dequantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-  mova                            m3, [dequantq]           ; m3 = dequant
-  pcmpeqw                         m4, m4                   ; All lanes -1
-%ifidn %1, b_32x32
-  psubw                           m0, m4
-  psubw                           m1, m4
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  paddw                           m0, m4                   ; m0 = m0 + 1
-
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]            ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-
-
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .first_nonzero
-
-  mova        [qcoeffq+ncoeffq*4   ], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4   ], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-  add                        ncoeffq, mmsize
-
-  punpckhqdq                      m1, m1
-  punpckhqdq                      m2, m2
-  punpckhqdq                      m3, m3
-  punpckhqdq                      m4, m4
-  pxor                            m8, m8
-
-  jmp .ac_only_loop
-
-.first_nonzero:
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                    ; m8 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-
-.ac_only_loop:
-
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip this itertion.
-  ; And just write zeros as the result would be.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .rest_nonzero
-
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-
-.rest_nonzero:
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                    ; m14 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
deleted file mode 100644
index d3de6e24d..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <xmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/quantize_x86.h"
-
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
-                                      tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  __m128i one = _mm_set1_epi16(1);
-  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-}
-
-void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
-  const __m128i zero = _mm_setzero_si128();
-  int index = 16;
-
-  __m128i zbin, round, quant, dequant, shift;
-  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
-  __m128i qcoeff0, qcoeff1;
-  __m128i cmp_mask0, cmp_mask1;
-  __m128i eob, eob0;
-
-  (void)scan_ptr;
-
-  // Setup global values.
-  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
-                dequant_ptr, &dequant, quant_shift_ptr, &shift);
-
-  // Do DC and first 15 AC.
-  coeff0 = load_coefficients(coeff_ptr);
-  coeff1 = load_coefficients(coeff_ptr + 8);
-
-  // Poor man's abs().
-  coeff0_sign = _mm_srai_epi16(coeff0, 15);
-  coeff1_sign = _mm_srai_epi16(coeff1, 15);
-  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
-  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
-
-  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-
-  calculate_qcoeff(&qcoeff0, round, quant, shift);
-
-  round = _mm_unpackhi_epi64(round, round);
-  quant = _mm_unpackhi_epi64(quant, quant);
-  shift = _mm_unpackhi_epi64(shift, shift);
-
-  calculate_qcoeff(&qcoeff1, round, quant, shift);
-
-  // Reinsert signs
-  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
-  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
-
-  // Mask out zbin threshold coeffs
-  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-  store_coefficients(qcoeff0, qcoeff_ptr);
-  store_coefficients(qcoeff1, qcoeff_ptr + 8);
-
-  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
-  dequant = _mm_unpackhi_epi64(dequant, dequant);
-  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
-
-  store_coefficients(coeff0, dqcoeff_ptr);
-  store_coefficients(coeff1, dqcoeff_ptr + 8);
-
-  eob =
-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
-
-  // AC only loop.
-  while (index < n_coeffs) {
-    coeff0 = load_coefficients(coeff_ptr + index);
-    coeff1 = load_coefficients(coeff_ptr + index + 8);
-
-    coeff0_sign = _mm_srai_epi16(coeff0, 15);
-    coeff1_sign = _mm_srai_epi16(coeff1, 15);
-    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
-    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
-
-    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-
-    calculate_qcoeff(&qcoeff0, round, quant, shift);
-    calculate_qcoeff(&qcoeff1, round, quant, shift);
-
-    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
-    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
-
-    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-    store_coefficients(qcoeff0, qcoeff_ptr + index);
-    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
-
-    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
-    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
-
-    store_coefficients(coeff0, dqcoeff_ptr + index);
-    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
-
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
-    eob = _mm_max_epi16(eob, eob0);
-
-    index += 16;
-  }
-
-  *eob_ptr = accumulate_eob(eob);
-}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
deleted file mode 100644
index 39d4ca674..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movifnidn                 dequantq, dequantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [dequantq]           ; m3 = dequant
-  mov                             r2, shiftmp
-  psubw                           m0, [GLOBAL(pw_1)]
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r3, qcoeffmp
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [  coeffq+ncoeffq*4+16]
-  mova                           m10, [  coeffq+ncoeffq*4+32]
-  packssdw                       m10, [  coeffq+ncoeffq*4+48]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                           m11, m8
-  mova                            m6, m8
-  pcmpgtw                         m5, m8
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                            m11, m8
-  mova                            m6, m8
-  pcmpgtw                         m5, m8
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [  coeffq+ncoeffq*4+16]
-  mova                           m10, [  coeffq+ncoeffq*4+32]
-  packssdw                       m10, [  coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pxor                           m11, m11
-  mova                           m11, m14
-  mova                            m6, m14
-  pcmpgtw                         m5, m14
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                           m11, m14
-  mova                            m6, m14
-  pcmpgtw                         m5, m14
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5
-
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*4+ 0], m5
-  mova        [qcoeffq+ncoeffq*4+16], m5
-  mova        [qcoeffq+ncoeffq*4+32], m5
-  mova        [qcoeffq+ncoeffq*4+48], m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m5
-  mova       [dqcoeffq+ncoeffq*4+16], m5
-  mova       [dqcoeffq+ncoeffq*4+32], m5
-  mova       [dqcoeffq+ncoeffq*4+48], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
deleted file mode 100644
index 4eed7dd29..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_x86.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- *  Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom/aom_integer.h"
-
-static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
-                                 const int16_t *round_ptr, __m128i *round,
-                                 const int16_t *quant_ptr, __m128i *quant,
-                                 const int16_t *dequant_ptr, __m128i *dequant,
-                                 const int16_t *shift_ptr, __m128i *shift) {
-  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  *round = _mm_load_si128((const __m128i *)round_ptr);
-  *quant = _mm_load_si128((const __m128i *)quant_ptr);
-  *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
-  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  *shift = _mm_load_si128((const __m128i *)shift_ptr);
-}
-
-// With ssse3 and later abs() and sign() are preferred.
-static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
-  a = _mm_xor_si128(a, sign);
-  return _mm_sub_epi16(a, sign);
-}
-
-static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
-                                    const __m128i quant, const __m128i shift) {
-  __m128i tmp, qcoeff;
-  qcoeff = _mm_adds_epi16(*coeff, round);
-  tmp = _mm_mulhi_epi16(qcoeff, quant);
-  qcoeff = _mm_add_epi16(tmp, qcoeff);
-  *coeff = _mm_mulhi_epi16(qcoeff, shift);
-}
-
-static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
-  return _mm_mullo_epi16(qcoeff, dequant);
-}
-
-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
-// to zbin to add 1 to the index in 'scan'.
-static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
-                                   const __m128i zbin_mask0,
-                                   const __m128i zbin_mask1,
-                                   const int16_t *scan_ptr, const int index,
-                                   const __m128i zero) {
-  const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
-  const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
-  __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
-  __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
-  __m128i eob0, eob1;
-  // Add one to convert from indices to counts
-  scan0 = _mm_sub_epi16(scan0, zbin_mask0);
-  scan1 = _mm_sub_epi16(scan1, zbin_mask1);
-  eob0 = _mm_andnot_si128(zero_coeff0, scan0);
-  eob1 = _mm_andnot_si128(zero_coeff1, scan1);
-  return _mm_max_epi16(eob0, eob1);
-}
-
-static INLINE int16_t accumulate_eob(__m128i eob) {
-  __m128i eob_shuffled;
-  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-  eob = _mm_max_epi16(eob, eob_shuffled);
-  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-  eob = _mm_max_epi16(eob, eob_shuffled);
-  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-  eob = _mm_max_epi16(eob, eob_shuffled);
-  return _mm_extract_epi16(eob, 1);
-}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
deleted file mode 100644
index f662b62b1..000000000
--- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>  // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 32; i++) {
-    // load src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-  {
-    __m128i sum;
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
-  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
-  __m256i ref3_reg, ref3next_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 64; i++) {
-    // load 64 bytes from src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
-    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
-    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
-    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
-
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-  {
-    __m128i sum;
-
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += src_stride << 5;
-  rf[0] += ref_stride << 5;
-  rf[1] += ref_stride << 5;
-  rf[2] += ref_stride << 5;
-  rf[3] += ref_stride << 5;
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
-
-void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-  unsigned int half_width = 32;
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += half_width;
-  rf[0] += half_width;
-  rf[1] += half_width;
-  rf[2] += half_width;
-  rf[3] += half_width;
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
deleted file mode 100644
index 55a856985..000000000
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ /dev/null
@@ -1,257 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_4x2x4 5-6 0
-  movd                  m0, [srcq +%2]
-%if %1 == 1
-  movd                  m6, [ref1q+%3]
-  movd                  m4, [ref2q+%3]
-  movd                  m7, [ref3q+%3]
-  movd                  m5, [ref4q+%3]
-  movd                  m1, [srcq +%4]
-  movd                  m2, [ref1q+%5]
-  punpckldq             m0, m1
-  punpckldq             m6, m2
-  movd                  m1, [ref2q+%5]
-  movd                  m2, [ref3q+%5]
-  movd                  m3, [ref4q+%5]
-  punpckldq             m4, m1
-  punpckldq             m7, m2
-  punpckldq             m5, m3
-  movlhps               m0, m0
-  movlhps               m6, m4
-  movlhps               m7, m5
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movd                  m1, [ref1q+%3]
-  movd                  m5, [ref1q+%5]
-  movd                  m2, [ref2q+%3]
-  movd                  m4, [ref2q+%5]
-  punpckldq             m1, m5
-  punpckldq             m2, m4
-  movd                  m3, [ref3q+%3]
-  movd                  m5, [ref3q+%5]
-  punpckldq             m3, m5
-  movd                  m4, [ref4q+%3]
-  movd                  m5, [ref4q+%5]
-  punpckldq             m4, m5
-  movd                  m5, [srcq +%4]
-  punpckldq             m0, m5
-  movlhps               m0, m0
-  movlhps               m1, m2
-  movlhps               m3, m4
-  psadbw                m1, m0
-  psadbw                m3, m0
-  paddd                 m6, m1
-  paddd                 m7, m3
-%endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_8x2x4 5-6 0
-  movh                  m0, [srcq +%2]
-%if %1 == 1
-  movh                  m4, [ref1q+%3]
-  movh                  m5, [ref2q+%3]
-  movh                  m6, [ref3q+%3]
-  movh                  m7, [ref4q+%3]
-  movhps                m0, [srcq +%4]
-  movhps                m4, [ref1q+%5]
-  movhps                m5, [ref2q+%5]
-  movhps                m6, [ref3q+%5]
-  movhps                m7, [ref4q+%5]
-  psadbw                m4, m0
-  psadbw                m5, m0
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movh                  m1, [ref1q+%3]
-  movh                  m2, [ref2q+%3]
-  movh                  m3, [ref3q+%3]
-  movhps                m0, [srcq +%4]
-  movhps                m1, [ref1q+%5]
-  movhps                m2, [ref2q+%5]
-  movhps                m3, [ref3q+%5]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movh                  m1, [ref4q+%3]
-  movhps                m1, [ref4q+%5]
-  paddd                 m5, m2
-  paddd                 m6, m3
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_16x2x4 5-6 0
-  ; 1st 16 px
-  mova                  m0, [srcq +%2]
-%if %1 == 1
-  movu                  m4, [ref1q+%3]
-  movu                  m5, [ref2q+%3]
-  movu                  m6, [ref3q+%3]
-  movu                  m7, [ref4q+%3]
-  psadbw                m4, m0
-  psadbw                m5, m0
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movu                  m1, [ref1q+%3]
-  movu                  m2, [ref2q+%3]
-  movu                  m3, [ref3q+%3]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movu                  m1, [ref4q+%3]
-  paddd                 m5, m2
-  paddd                 m6, m3
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endif
-
-  ; 2nd 16 px
-  mova                  m0, [srcq +%4]
-  movu                  m1, [ref1q+%5]
-  movu                  m2, [ref2q+%5]
-  movu                  m3, [ref3q+%5]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movu                  m1, [ref4q+%5]
-  paddd                 m5, m2
-  paddd                 m6, m3
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
-%endif
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endmacro
-
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_32x2x4 5-6 0
-  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
-  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
-%endmacro
-
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_64x2x4 5-6 0
-  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
-  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
-%endmacro
-
-; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_128x2x4 5-6 0
-  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
-  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6
-%endmacro
-
-; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
-;                         uint8_t *ref[4], int ref_stride,
-;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
-%if UNIX64
-cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
-                              res, ref2, ref3, ref4
-%else
-cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
-                              ref2, ref3, ref4
-%endif
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  mov                ref2q, [ref1q+gprsize*1]
-  mov                ref3q, [ref1q+gprsize*2]
-  mov                ref4q, [ref1q+gprsize*3]
-  mov                ref1q, [ref1q+gprsize*0]
-
-  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
-  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
-  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
-
-%if %1 > 4
-  pslldq                m5, 4
-  pslldq                m7, 4
-  por                   m4, m5
-  por                   m6, m7
-  mova                  m5, m4
-  mova                  m7, m6
-  punpcklqdq            m4, m6
-  punpckhqdq            m5, m7
-  movifnidn             r4, r4mp
-  paddd                 m4, m5
-  movu                [r4], m4
-  RET
-%else
-  movifnidn             r4, r4mp
-  pshufd            m6, m6, 0x08
-  pshufd            m7, m7, 0x08
-  movq              [r4+0], m6
-  movq              [r4+8], m7
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-SADNXN4D 128, 128
-SADNXN4D 128, 64
-SADNXN4D 64,  128
-SADNXN4D 64, 64
-SADNXN4D 64, 32
-SADNXN4D 32, 64
-SADNXN4D 32, 32
-SADNXN4D 32, 16
-SADNXN4D 16, 32
-SADNXN4D 16, 16
-SADNXN4D 16,  8
-SADNXN4D  8, 16
-SADNXN4D  8,  8
-SADNXN4D  8,  4
-SADNXN4D  4,  8
-SADNXN4D  4,  4
-SADNXN4D  4, 16
-SADNXN4D 16,  4
-SADNXN4D  8, 32
-SADNXN4D 32,  8
-SADNXN4D 16, 64
-SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
deleted file mode 100644
index a50dba64a..000000000
--- a/third_party/aom/aom_dsp/x86/sad_avx2.c
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-#define FSAD64_H(h)                                                           \
-  unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSAD32_H(h)                                                           \
-  unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
-
-#define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
-
-/* clang-format off */
-FSAD64
-FSAD32
-/* clang-format on */
-
-#undef FSAD64
-#undef FSAD32
-#undef FSAD64_H
-#undef FSAD32_H
-
-#define FSADAVG64_H(h)                                                        \
-  unsigned int aom_sad64x##h##_avg_avx2(                                      \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      ref1_reg = _mm256_avg_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
-      ref2_reg = _mm256_avg_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-      second_pred += 64;                                                      \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSADAVG32_H(h)                                                        \
-  unsigned int aom_sad32x##h##_avg_avx2(                                      \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      ref1_reg = _mm256_avg_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
-      ref2_reg = _mm256_avg_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-      second_pred += 64;                                                      \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
-
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
-
-/* clang-format off */
-FSADAVG64
-FSADAVG32
-/* clang-format on */
-
-#undef FSADAVG64
-#undef FSADAVG32
-#undef FSADAVG64_H
-#undef FSADAVG32_H
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
deleted file mode 100644
index b506d4663..000000000
--- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
+++ /dev/null
@@ -1,1038 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_ports/mem.h"
-
-// SAD
-static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
-  // input 8 32-bit summation
-  __m128i lo128, hi128;
-  __m256i u = _mm256_srli_si256(*v, 8);
-  u = _mm256_add_epi32(u, *v);
-
-  // 4 32-bit summation
-  hi128 = _mm256_extracti128_si256(u, 1);
-  lo128 = _mm256_castsi256_si128(u);
-  lo128 = _mm_add_epi32(hi128, lo128);
-
-  // 2 32-bit summation
-  hi128 = _mm_srli_si128(lo128, 4);
-  lo128 = _mm_add_epi32(lo128, hi128);
-
-  return (unsigned int)_mm_cvtsi128_si32(lo128);
-}
-
-unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
-                                     const uint8_t *ref, int ref_stride) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-
-  // first 4 rows
-  __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  __m256i u0 = _mm256_sub_epi16(s0, r0);
-  __m256i u1 = _mm256_sub_epi16(s1, r1);
-  __m256i u2 = _mm256_sub_epi16(s2, r2);
-  __m256i u3 = _mm256_sub_epi16(s3, r3);
-  __m256i zero = _mm256_setzero_si256();
-  __m256i sum0, sum1;
-
-  u0 = _mm256_abs_epi16(u0);
-  u1 = _mm256_abs_epi16(u1);
-  u2 = _mm256_abs_epi16(u2);
-  u3 = _mm256_abs_epi16(u3);
-
-  sum0 = _mm256_add_epi16(u0, u1);
-  sum0 = _mm256_add_epi16(sum0, u2);
-  sum0 = _mm256_add_epi16(sum0, u3);
-
-  // second 4 rows
-  src_ptr += src_stride << 2;
-  ref_ptr += ref_stride << 2;
-  s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  u0 = _mm256_sub_epi16(s0, r0);
-  u1 = _mm256_sub_epi16(s1, r1);
-  u2 = _mm256_sub_epi16(s2, r2);
-  u3 = _mm256_sub_epi16(s3, r3);
-
-  u0 = _mm256_abs_epi16(u0);
-  u1 = _mm256_abs_epi16(u1);
-  u2 = _mm256_abs_epi16(u2);
-  u3 = _mm256_abs_epi16(u3);
-
-  sum1 = _mm256_add_epi16(u0, u1);
-  sum1 = _mm256_add_epi16(sum1, u2);
-  sum1 = _mm256_add_epi16(sum1, u3);
-
-  // find out the SAD
-  s0 = _mm256_unpacklo_epi16(sum0, zero);
-  s1 = _mm256_unpackhi_epi16(sum0, zero);
-  r0 = _mm256_unpacklo_epi16(sum1, zero);
-  r1 = _mm256_unpackhi_epi16(sum1, zero);
-  s0 = _mm256_add_epi32(s0, s1);
-  r0 = _mm256_add_epi32(r0, r1);
-  sum0 = _mm256_add_epi32(s0, r0);
-  // 8 32-bit summation
-
-  return (unsigned int)get_sad_from_mm256_epi32(&sum0);
-}
-
-unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
-  __m256i sum0;
-  __m256i sum = _mm256_setzero_si256();
-  const __m256i zero = _mm256_setzero_si256();
-  int row = 0;
-
-  // Loop for every 4 rows
-  while (row < 16) {
-    s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-    s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-    s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-    s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-    r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-    r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-    r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-    r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-    u0 = _mm256_sub_epi16(s0, r0);
-    u1 = _mm256_sub_epi16(s1, r1);
-    u2 = _mm256_sub_epi16(s2, r2);
-    u3 = _mm256_sub_epi16(s3, r3);
-
-    u0 = _mm256_abs_epi16(u0);
-    u1 = _mm256_abs_epi16(u1);
-    u2 = _mm256_abs_epi16(u2);
-    u3 = _mm256_abs_epi16(u3);
-
-    sum0 = _mm256_add_epi16(u0, u1);
-    sum0 = _mm256_add_epi16(sum0, u2);
-    sum0 = _mm256_add_epi16(sum0, u3);
-
-    s0 = _mm256_unpacklo_epi16(sum0, zero);
-    s1 = _mm256_unpackhi_epi16(sum0, zero);
-    sum = _mm256_add_epi32(sum, s0);
-    sum = _mm256_add_epi32(sum, s1);
-    // 8 32-bit summation
-
-    row += 4;
-    src_ptr += src_stride << 2;
-    ref_ptr += ref_stride << 2;
-  }
-  return get_sad_from_mm256_epi32(&sum);
-}
-
-static void sad32x4(const uint16_t *src_ptr, int src_stride,
-                    const uint16_t *ref_ptr, int ref_stride,
-                    const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3;
-  const __m256i zero = _mm256_setzero_si256();
-  int row_sections = 0;
-
-  while (row_sections < 2) {
-    s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-    s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-    s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-    s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-
-    r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-    r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-    r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-    r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-
-    if (sec_ptr) {
-      r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
-      r1 = _mm256_avg_epu16(
-          r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-      r2 = _mm256_avg_epu16(
-          r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-      r3 = _mm256_avg_epu16(
-          r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    }
-    s0 = _mm256_sub_epi16(s0, r0);
-    s1 = _mm256_sub_epi16(s1, r1);
-    s2 = _mm256_sub_epi16(s2, r2);
-    s3 = _mm256_sub_epi16(s3, r3);
-
-    s0 = _mm256_abs_epi16(s0);
-    s1 = _mm256_abs_epi16(s1);
-    s2 = _mm256_abs_epi16(s2);
-    s3 = _mm256_abs_epi16(s3);
-
-    s0 = _mm256_add_epi16(s0, s1);
-    s0 = _mm256_add_epi16(s0, s2);
-    s0 = _mm256_add_epi16(s0, s3);
-
-    r0 = _mm256_unpacklo_epi16(s0, zero);
-    r1 = _mm256_unpackhi_epi16(s0, zero);
-
-    r0 = _mm256_add_epi32(r0, r1);
-    *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-
-    row_sections += 1;
-    src_ptr += src_stride << 1;
-    ref_ptr += ref_stride << 1;
-    if (sec_ptr) sec_ptr += 32 << 1;
-  }
-}
-
-unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  const int left_shift = 2;
-  int row_section = 0;
-
-  while (row_section < 4) {
-    sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 4;
-  ref += ref_stride << 4;
-  sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 4;
-  ref += ref_stride << 4;
-  sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 5;
-  ref += ref_stride << 5;
-  sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-static void sad64x2(const uint16_t *src_ptr, int src_stride,
-                    const uint16_t *ref_ptr, int ref_stride,
-                    const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s[8], r[8];
-  const __m256i zero = _mm256_setzero_si256();
-
-  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
-  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
-  s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-  s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
-  s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
-
-  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
-  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
-  r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-  r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
-  r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
-
-  if (sec_ptr) {
-    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r[1] = _mm256_avg_epu16(
-        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r[2] = _mm256_avg_epu16(
-        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r[3] = _mm256_avg_epu16(
-        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    r[4] = _mm256_avg_epu16(
-        r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
-    r[5] = _mm256_avg_epu16(
-        r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
-    r[6] = _mm256_avg_epu16(
-        r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
-    r[7] = _mm256_avg_epu16(
-        r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
-  }
-
-  s[0] = _mm256_sub_epi16(s[0], r[0]);
-  s[1] = _mm256_sub_epi16(s[1], r[1]);
-  s[2] = _mm256_sub_epi16(s[2], r[2]);
-  s[3] = _mm256_sub_epi16(s[3], r[3]);
-  s[4] = _mm256_sub_epi16(s[4], r[4]);
-  s[5] = _mm256_sub_epi16(s[5], r[5]);
-  s[6] = _mm256_sub_epi16(s[6], r[6]);
-  s[7] = _mm256_sub_epi16(s[7], r[7]);
-
-  s[0] = _mm256_abs_epi16(s[0]);
-  s[1] = _mm256_abs_epi16(s[1]);
-  s[2] = _mm256_abs_epi16(s[2]);
-  s[3] = _mm256_abs_epi16(s[3]);
-  s[4] = _mm256_abs_epi16(s[4]);
-  s[5] = _mm256_abs_epi16(s[5]);
-  s[6] = _mm256_abs_epi16(s[6]);
-  s[7] = _mm256_abs_epi16(s[7]);
-
-  s[0] = _mm256_add_epi16(s[0], s[1]);
-  s[0] = _mm256_add_epi16(s[0], s[2]);
-  s[0] = _mm256_add_epi16(s[0], s[3]);
-
-  s[4] = _mm256_add_epi16(s[4], s[5]);
-  s[4] = _mm256_add_epi16(s[4], s[6]);
-  s[4] = _mm256_add_epi16(s[4], s[7]);
-
-  r[0] = _mm256_unpacklo_epi16(s[0], zero);
-  r[1] = _mm256_unpackhi_epi16(s[0], zero);
-  r[2] = _mm256_unpacklo_epi16(s[4], zero);
-  r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
-  r[0] = _mm256_add_epi32(r[0], r[1]);
-  r[0] = _mm256_add_epi32(r[0], r[2]);
-  r[0] = _mm256_add_epi32(r[0], r[3]);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  const int left_shift = 1;
-  int row_section = 0;
-
-  while (row_section < 16) {
-    sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 5;
-  ref += ref_stride << 5;
-  sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
-                     const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s[8], r[8];
-  const __m256i zero = _mm256_setzero_si256();
-
-  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
-  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
-  s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
-  s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
-  s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
-  s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
-
-  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
-  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
-  r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
-  r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
-  r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
-  r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
-
-  if (sec_ptr) {
-    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r[1] = _mm256_avg_epu16(
-        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r[2] = _mm256_avg_epu16(
-        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r[3] = _mm256_avg_epu16(
-        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    r[4] = _mm256_avg_epu16(
-        r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
-    r[5] = _mm256_avg_epu16(
-        r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
-    r[6] = _mm256_avg_epu16(
-        r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
-    r[7] = _mm256_avg_epu16(
-        r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
-  }
-
-  s[0] = _mm256_sub_epi16(s[0], r[0]);
-  s[1] = _mm256_sub_epi16(s[1], r[1]);
-  s[2] = _mm256_sub_epi16(s[2], r[2]);
-  s[3] = _mm256_sub_epi16(s[3], r[3]);
-  s[4] = _mm256_sub_epi16(s[4], r[4]);
-  s[5] = _mm256_sub_epi16(s[5], r[5]);
-  s[6] = _mm256_sub_epi16(s[6], r[6]);
-  s[7] = _mm256_sub_epi16(s[7], r[7]);
-
-  s[0] = _mm256_abs_epi16(s[0]);
-  s[1] = _mm256_abs_epi16(s[1]);
-  s[2] = _mm256_abs_epi16(s[2]);
-  s[3] = _mm256_abs_epi16(s[3]);
-  s[4] = _mm256_abs_epi16(s[4]);
-  s[5] = _mm256_abs_epi16(s[5]);
-  s[6] = _mm256_abs_epi16(s[6]);
-  s[7] = _mm256_abs_epi16(s[7]);
-
-  s[0] = _mm256_add_epi16(s[0], s[1]);
-  s[0] = _mm256_add_epi16(s[0], s[2]);
-  s[0] = _mm256_add_epi16(s[0], s[3]);
-
-  s[4] = _mm256_add_epi16(s[4], s[5]);
-  s[4] = _mm256_add_epi16(s[4], s[6]);
-  s[4] = _mm256_add_epi16(s[4], s[7]);
-
-  r[0] = _mm256_unpacklo_epi16(s[0], zero);
-  r[1] = _mm256_unpackhi_epi16(s[0], zero);
-  r[2] = _mm256_unpacklo_epi16(s[4], zero);
-  r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
-  r[0] = _mm256_add_epi32(r[0], r[1]);
-  r[0] = _mm256_add_epi32(r[0], r[2]);
-  r[0] = _mm256_add_epi32(r[0], r[3]);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  int row = 0;
-  while (row < 64) {
-    sad128x1(srcp, refp, NULL, &sad);
-    srcp += src_stride;
-    refp += ref_stride;
-    row += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 6;
-  ref += ref_stride << 6;
-  sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
-                                        const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 6;
-  ref += ref_stride << 6;
-  sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
-static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
-                           const uint16_t *ref_ptr, int ref_stride,
-                           const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3;
-  const __m256i zero = _mm256_setzero_si256();
-
-  s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  if (sec_ptr) {
-    r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r1 = _mm256_avg_epu16(r1,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r2 = _mm256_avg_epu16(r2,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r3 = _mm256_avg_epu16(r3,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-  }
-
-  s0 = _mm256_sub_epi16(s0, r0);
-  s1 = _mm256_sub_epi16(s1, r1);
-  s2 = _mm256_sub_epi16(s2, r2);
-  s3 = _mm256_sub_epi16(s3, r3);
-
-  s0 = _mm256_abs_epi16(s0);
-  s1 = _mm256_abs_epi16(s1);
-  s2 = _mm256_abs_epi16(s2);
-  s3 = _mm256_abs_epi16(s3);
-
-  s0 = _mm256_add_epi16(s0, s1);
-  s0 = _mm256_add_epi16(s0, s2);
-  s0 = _mm256_add_epi16(s0, s3);
-
-  r0 = _mm256_unpacklo_epi16(s0, zero);
-  r1 = _mm256_unpackhi_epi16(s0, zero);
-
-  r0 = _mm256_add_epi32(r0, r1);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-}
-
-unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
-                                         const uint8_t *ref, int ref_stride,
-                                         const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-
-  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-
-  // Next 4 rows
-  srcp += src_stride << 2;
-  refp += ref_stride << 2;
-  secp += 64;
-  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 3;
-  uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
-                                             second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 16 << left_shift;
-  sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
-                                     second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 4;
-  uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 16 << left_shift;
-  sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-  const int left_shift = 2;
-  int row_section = 0;
-
-  while (row_section < 4) {
-    sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    secp += 32 << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 4;
-  uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 32 << left_shift;
-  sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 5;
-  uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 32 << left_shift;
-  sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-  const int left_shift = 1;
-  int row_section = 0;
-
-  while (row_section < 16) {
-    sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    secp += 64 << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 5;
-  uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 64 << left_shift;
-  sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
-                                           const uint8_t *ref, int ref_stride,
-                                           const uint8_t *second_pred) {
-  const int left_shift = 6;
-  uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 64 << left_shift;
-  sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
-                                           const uint8_t *ref, int ref_stride,
-                                           const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-  int row = 0;
-  while (row < 64) {
-    sad128x1(srcp, refp, secp, &sad);
-    srcp += src_stride;
-    refp += ref_stride;
-    secp += 16 << 3;
-    row += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
-                                            const uint8_t *ref, int ref_stride,
-                                            const uint8_t *second_pred) {
-  unsigned int sum;
-  const int left_shift = 6;
-
-  sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 128 << left_shift;
-  sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                       second_pred);
-  return sum;
-}
-
-// SAD 4D
-// Combine 4 __m256i vectors to uint32_t result[4]
-static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
-                                               uint32_t *res) {
-  __m256i u0, u1, u2, u3;
-  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
-  __m128i sad;
-
-  // 8 32-bit summation
-  u0 = _mm256_srli_si256(v[0], 4);
-  u1 = _mm256_srli_si256(v[1], 4);
-  u2 = _mm256_srli_si256(v[2], 4);
-  u3 = _mm256_srli_si256(v[3], 4);
-
-  u0 = _mm256_add_epi32(u0, v[0]);
-  u1 = _mm256_add_epi32(u1, v[1]);
-  u2 = _mm256_add_epi32(u2, v[2]);
-  u3 = _mm256_add_epi32(u3, v[3]);
-
-  u0 = _mm256_and_si256(u0, mask);
-  u1 = _mm256_and_si256(u1, mask);
-  u2 = _mm256_and_si256(u2, mask);
-  u3 = _mm256_and_si256(u3, mask);
-  // 4 32-bit summation, evenly positioned
-
-  u1 = _mm256_slli_si256(u1, 4);
-  u3 = _mm256_slli_si256(u3, 4);
-
-  u0 = _mm256_or_si256(u0, u1);
-  u2 = _mm256_or_si256(u2, u3);
-  // 8 32-bit summation, interleaved
-
-  u1 = _mm256_unpacklo_epi64(u0, u2);
-  u3 = _mm256_unpackhi_epi64(u0, u2);
-
-  u0 = _mm256_add_epi32(u1, u3);
-  sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
-                      _mm256_castsi256_si128(u0));
-  _mm_storeu_si128((__m128i *)res, sad);
-}
-
-static void convert_pointers(const uint8_t *const ref8[],
-                             const uint16_t *ref[]) {
-  ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
-  ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
-  ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
-  ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
-}
-
-static void init_sad(__m256i *s) {
-  s[0] = _mm256_setzero_si256();
-  s[1] = _mm256_setzero_si256();
-  s[2] = _mm256_setzero_si256();
-  s[3] = _mm256_setzero_si256();
-}
-
-void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
-                                const uint8_t *const ref_array[],
-                                int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
-  int i;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-    srcp += src_stride << shift_for_4_rows;
-    refp[i] += ref_stride << shift_for_4_rows;
-    sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first8rows[4];
-  uint32_t second8rows[4];
-  const uint8_t *ref[4];
-  const int shift_for_8_rows = 3;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
-  src += src_stride << shift_for_8_rows;
-  ref[0] += ref_stride << shift_for_8_rows;
-  ref[1] += ref_stride << shift_for_8_rows;
-  ref[2] += ref_stride << shift_for_8_rows;
-  ref[3] += ref_stride << shift_for_8_rows;
-  aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
-  sad_array[0] = first8rows[0] + second8rows[0];
-  sad_array[1] = first8rows[1] + second8rows[1];
-  sad_array[2] = first8rows[2] + second8rows[2];
-  sad_array[3] = first8rows[3] + second8rows[3];
-}
-
-void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 4;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 4) {
-      sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-      srcp += src_stride << shift_for_4_rows;
-      refp[i] += ref_stride << shift_for_4_rows;
-      rows_section++;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 4;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 5;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_rows = 1;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 16) {
-      sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
-      srcp += src_stride << shift_for_rows;
-      refp[i] += ref_stride << shift_for_rows;
-      rows_section++;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 5;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
-                                  const uint8_t *const ref_array[],
-                                  int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 6;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
-                                  const uint8_t *const ref_array[],
-                                  int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 64) {
-      sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
-      srcp += src_stride;
-      refp[i] += ref_stride;
-      rows_section++;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref_array[],
-                                   int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 6;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
deleted file mode 100644
index c6fd62c9e..000000000
--- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  __m256i s1, s2, r1, r2;
-  __m256i sum = _mm256_setzero_si256();
-  __m128i sum_i128;
-  int i;
-
-  for (i = 0; i < 16; ++i) {
-    r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
-    r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
-    s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
-    s2 = _mm256_sad_epu8(
-        r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
-    sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
-    ref_ptr += ref_stride << 1;
-    src_ptr += src_stride << 1;
-  }
-
-  sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
-  sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
-                           _mm256_castsi256_si128(sum));
-  return _mm_cvtsi128_si32(sum_i128);
-}
-
-static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  unsigned int half_width = 32;
-  uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += half_width;
-  ref_ptr += half_width;
-  sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += src_stride << 5;
-  ref_ptr += ref_stride << 5;
-  sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride) {
-  unsigned int half_width = 64;
-  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += half_width;
-  ref_ptr += half_width;
-  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride) {
-  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
-                                 const uint8_t *ref_ptr, int ref_stride) {
-  uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-static void sad64x64x4d(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
-                        __m128i *res) {
-  uint32_t sum[4];
-  aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum);
-  *res = _mm_loadu_si128((const __m128i *)sum);
-}
-
-void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
-                           const uint8_t *const ref[4], int ref_stride,
-                           uint32_t res[4]) {
-  __m128i sum0, sum1;
-  const uint8_t *rf[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
-  src += src_stride << 6;
-  rf[0] += ref_stride << 6;
-  rf[1] += ref_stride << 6;
-  rf[2] += ref_stride << 6;
-  rf[3] += ref_stride << 6;
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
-  sum0 = _mm_add_epi32(sum0, sum1);
-  _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
-                           const uint8_t *const ref[4], int ref_stride,
-                           uint32_t res[4]) {
-  __m128i sum0, sum1;
-  unsigned int half_width = 64;
-  const uint8_t *rf[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
-  src += half_width;
-  rf[0] += half_width;
-  rf[1] += half_width;
-  rf[2] += half_width;
-  rf[3] += half_width;
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
-  sum0 = _mm_add_epi32(sum0, sum1);
-  _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
-                            uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += src_stride << 6;
-  rf[0] += ref_stride << 6;
-  rf[1] += ref_stride << 6;
-  rf[2] += ref_stride << 6;
-  rf[3] += ref_stride << 6;
-  aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
-
-static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *ref_ptr, int ref_stride,
-                                     const int h, const uint8_t *second_pred,
-                                     const int second_pred_stride) {
-  int i, res;
-  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
-  __m256i sum_sad = _mm256_setzero_si256();
-  __m256i sum_sad_h;
-  __m128i sum_sad128;
-  for (i = 0; i < h; i++) {
-    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
-    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
-    ref1_reg = _mm256_avg_epu8(
-        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
-    ref2_reg = _mm256_avg_epu8(
-        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
-    sad1_reg =
-        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
-    sad2_reg = _mm256_sad_epu8(
-        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
-    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
-    ref_ptr += ref_stride;
-    src_ptr += src_stride;
-    second_pred += second_pred_stride;
-  }
-  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
-  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
-  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
-  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
-  res = _mm_cvtsi128_si32(sum_sad128);
-
-  return res;
-}
-
-unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred) {
-  uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                                  second_pred, 64);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  second_pred += 64 << 6;
-  sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                          second_pred, 64);
-  return sum;
-}
-
-unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred) {
-  unsigned int half_width = 64;
-  uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                                  second_pred, 128);
-  src_ptr += half_width;
-  ref_ptr += half_width;
-  second_pred += half_width;
-  sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                          second_pred, 128);
-  return sum;
-}
-
-unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *ref_ptr, int ref_stride,
-                                     const uint8_t *second_pred) {
-  uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
-                                        ref_stride, second_pred);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  second_pred += 128 << 6;
-  sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
-                                second_pred);
-  return sum;
-}
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
deleted file mode 100644
index 3251b7655..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro SAD_FN 4
-%if %4 == 0
-%if %3 == 5
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
-                            src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-%else ; avg
-%if %3 == 5
-cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
-                                    second_pred, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
-                                              ref, ref_stride, \
-                                              second_pred, \
-                                              src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; avg/sad
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-%endmacro
-
-; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
-;                                  uint8_t *ref, int ref_stride);
-%macro SAD128XN 1-2 0
-  SAD_FN 128, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+32]
-  psadbw                m4, [srcq+48]
-
-  paddd                 m1, m2
-  paddd                 m3, m4
-  paddd                 m0, m1
-  paddd                 m0, m3
-
-  movu                  m1, [refq+64]
-  movu                  m2, [refq+80]
-  movu                  m3, [refq+96]
-  movu                  m4, [refq+112]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*4]
-  pavgb                 m2, [second_predq+mmsize*5]
-  pavgb                 m3, [second_predq+mmsize*6]
-  pavgb                 m4, [second_predq+mmsize*7]
-  lea         second_predq, [second_predq+mmsize*8]
-%endif
-  psadbw                m1, [srcq+64]
-  psadbw                m2, [srcq+80]
-  psadbw                m3, [srcq+96]
-  psadbw                m4, [srcq+112]
-
-  add                 refq, ref_strideq
-  add                 srcq, src_strideq
-
-  paddd                 m1, m2
-  paddd                 m3, m4
-  paddd                 m0, m1
-  paddd                 m0, m3
-
-  sub              n_rowsd, 1
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD128XN 128     ; sad128x128_sse2
-SAD128XN 128, 1  ; sad128x128_avg_sse2
-SAD128XN 64      ; sad128x64_sse2
-SAD128XN 64, 1   ; sad128x64_avg_sse2
-
-
-; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD64XN 1-2 0
-  SAD_FN 64, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+32]
-  psadbw                m4, [srcq+48]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  add                 refq, ref_strideq
-  paddd                 m0, m1
-  add                 srcq, src_strideq
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD64XN 128     ; sad64x128_sse2
-SAD64XN 128, 1  ; sad64x128_avg_sse2
-SAD64XN 64 ; sad64x64_sse2
-SAD64XN 32 ; sad64x32_sse2
-SAD64XN 64, 1 ; sad64x64_avg_sse2
-SAD64XN 32, 1 ; sad64x32_avg_sse2
-SAD64XN 16 ; sad64x16_sse2
-SAD64XN 16, 1 ; sad64x16_avg_sse2
-
-; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD32XN 1-2 0
-  SAD_FN 32, %1, 5, %2
-  mov              n_rowsd, %1/2
-  pxor                  m0, m0
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+ref_strideq]
-  movu                  m4, [refq+ref_strideq+16]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+src_strideq]
-  psadbw                m4, [srcq+src_strideq+16]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD32XN 64 ; sad32x64_sse2
-SAD32XN 32 ; sad32x32_sse2
-SAD32XN 16 ; sad32x16_sse2
-SAD32XN 64, 1 ; sad32x64_avg_sse2
-SAD32XN 32, 1 ; sad32x32_avg_sse2
-SAD32XN 16, 1 ; sad32x16_avg_sse2
-SAD32XN 8 ; sad_32x8_sse2
-SAD32XN 8, 1 ; sad_32x8_avg_sse2
-
-; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro SAD16XN 1-2 0
-  SAD_FN 16, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+ref_strideq]
-  movu                  m3, [refq+ref_strideq*2]
-  movu                  m4, [refq+ref_stride3q]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+src_strideq]
-  psadbw                m3, [srcq+src_strideq*2]
-  psadbw                m4, [srcq+src_stride3q]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD16XN 32 ; sad16x32_sse2
-SAD16XN 16 ; sad16x16_sse2
-SAD16XN  8 ; sad16x8_sse2
-SAD16XN 32, 1 ; sad16x32_avg_sse2
-SAD16XN 16, 1 ; sad16x16_avg_sse2
-SAD16XN  8, 1 ; sad16x8_avg_sse2
-SAD16XN 4 ; sad_16x4_sse2
-SAD16XN 4, 1 ; sad_16x4_avg_sse2
-SAD16XN 64 ; sad_16x64_sse2
-SAD16XN 64, 1 ; sad_16x64_avg_sse2
-
-; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
-;                                   uint8_t *ref, int ref_stride);
-%macro SAD8XN 1-2 0
-  SAD_FN 8, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-
-.loop:
-  movh                  m1, [refq]
-  movhps                m1, [refq+ref_strideq]
-  movh                  m2, [refq+ref_strideq*2]
-  movhps                m2, [refq+ref_stride3q]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  lea         second_predq, [second_predq+mmsize*2]
-%endif
-  movh                  m3, [srcq]
-  movhps                m3, [srcq+src_strideq]
-  movh                  m4, [srcq+src_strideq*2]
-  movhps                m4, [srcq+src_stride3q]
-  psadbw                m1, m3
-  psadbw                m2, m4
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m2
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD8XN 16 ; sad8x16_sse2
-SAD8XN  8 ; sad8x8_sse2
-SAD8XN  4 ; sad8x4_sse2
-SAD8XN 16, 1 ; sad8x16_avg_sse2
-SAD8XN  8, 1 ; sad8x8_avg_sse2
-SAD8XN  4, 1 ; sad8x4_avg_sse2
-SAD8XN 32 ; sad_8x32_sse2
-SAD8XN 32, 1 ; sad_8x32_avg_sse2
-
-; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
-;                                   uint8_t *ref, int ref_stride);
-%macro SAD4XN 1-2 0
-  SAD_FN 4, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-
-.loop:
-  movd                  m1, [refq]
-  movd                  m2, [refq+ref_strideq]
-  movd                  m3, [refq+ref_strideq*2]
-  movd                  m4, [refq+ref_stride3q]
-  punpckldq             m1, m2
-  punpckldq             m3, m4
-  movlhps               m1, m3
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  lea         second_predq, [second_predq+mmsize*1]
-%endif
-  movd                  m2, [srcq]
-  movd                  m5, [srcq+src_strideq]
-  movd                  m4, [srcq+src_strideq*2]
-  movd                  m3, [srcq+src_stride3q]
-  punpckldq             m2, m5
-  punpckldq             m4, m3
-  movlhps               m2, m4
-  psadbw                m1, m2
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD4XN  8 ; sad4x8_sse
-SAD4XN  4 ; sad4x4_sse
-SAD4XN  8, 1 ; sad4x8_avg_sse
-SAD4XN  4, 1 ; sad4x4_avg_sse
-SAD4XN 16 ; sad_4x16_sse2
-SAD4XN 16, 1 ; sad_4x16_avg_sse2
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
deleted file mode 100644
index 305dde5c0..000000000
--- a/third_party/aom/aom_dsp/x86/sse_avx2.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <smmintrin.h>
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
-                                const uint8_t *b) {
-  const __m256i v_a0 = yy_loadu_256(a);
-  const __m256i v_b0 = yy_loadu_256(b);
-  const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
-  const __m256i v_a01_w =
-      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
-  const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
-  const __m256i v_b01_w =
-      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
-  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
-  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
-}
-
-static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
-  int64_t sum;
-  const __m256i sum0_4x64 =
-      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
-  const __m256i sum1_4x64 =
-      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
-  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
-  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
-                                         _mm256_extracti128_si256(sum_4x64, 1));
-  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int width, int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  __m256i sum = _mm256_setzero_si256();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_32(a);
-        const __m128i v_a1 = xx_loadl_32(a + a_stride);
-        const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
-        const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
-        const __m128i v_b0 = xx_loadl_32(b);
-        const __m128i v_b1 = xx_loadl_32(b + b_stride);
-        const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
-        const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
-        const __m128i v_a0123 = _mm_unpacklo_epi64(
-            _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
-        const __m128i v_b0123 = _mm_unpacklo_epi64(
-            _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
-        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
-        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 2;
-        b += b_stride << 2;
-        y += 4;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 8:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_a1 = xx_loadl_64(a + a_stride);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_b1 = xx_loadl_64(b + b_stride);
-        const __m256i v_a_w =
-            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
-        const __m256i v_b_w =
-            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 16:
-      do {
-        const __m128i v_a0 = xx_loadu_128(a);
-        const __m128i v_b0 = xx_loadu_128(b);
-        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
-        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 32:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 64:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        sse_w32_avx2(&sum, a + 32, b + 32);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 128:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        sse_w32_avx2(&sum, a + 32, b + 32);
-        sse_w32_avx2(&sum, a + 64, b + 64);
-        sse_w32_avx2(&sum, a + 96, b + 96);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    default: break;
-  }
-
-  return sse;
-}
-
-static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
-                                       const uint16_t *b) {
-  const __m256i v_a_w = yy_loadu_256(a);
-  const __m256i v_b_w = yy_loadu_256(b);
-  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  __m256i sum = _mm256_setzero_si256();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_a1 = xx_loadl_64(a + a_stride);
-        const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
-        const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_b1 = xx_loadl_64(b + b_stride);
-        const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
-        const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
-        const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
-                                           _mm_unpacklo_epi64(v_a2, v_a3));
-        const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
-                                           _mm_unpacklo_epi64(v_b2, v_b3));
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 2;
-        b += b_stride << 2;
-        y += 4;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 8:
-      do {
-        const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
-        const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 16:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 32:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        highbd_sse_w16_avx2(&sum, a + 16, b + 16);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 64:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
-        highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
-        highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 128:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
-        highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
-        highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
-        highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
-        highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
-        highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
-        highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    default: break;
-  }
-  return sse;
-}
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
deleted file mode 100644
index 8b5af8469..000000000
--- a/third_party/aom/aom_dsp/x86/sse_sse4.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
-  int64_t sum;
-  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
-  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
-  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
-  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
-                                  const uint8_t *b) {
-  const __m128i v_a0 = xx_loadu_128(a);
-  const __m128i v_b0 = xx_loadu_128(b);
-  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
-  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
-  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
-  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
-  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
-  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
-}
-
-int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
-                       int b_stride, int width, int height) {
-  int y = 0;
-  int64_t sse = 0;
-  __m128i sum = _mm_setzero_si128();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_32(a);
-        const __m128i v_a1 = xx_loadl_32(a + a_stride);
-        const __m128i v_b0 = xx_loadl_32(b);
-        const __m128i v_b1 = xx_loadl_32(b + b_stride);
-        const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
-        const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
-        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 8:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
-        const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
-        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 16:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 32:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16, b + 16);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 64:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 128:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
-        sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
-        sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
-        sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
-        sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    default: break;
-  }
-
-  return sse;
-}
-
-static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
-                                        const uint16_t *b) {
-  const __m128i v_a_w = xx_loadu_128(a);
-  const __m128i v_b_w = xx_loadu_128(b);
-  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int width,
-                              int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  __m128i sum = _mm_setzero_si128();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_a1 = xx_loadl_64(a + a_stride);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_b1 = xx_loadl_64(b + b_stride);
-        const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
-        const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
-        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 8:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 16:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 32:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 64:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 128:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    default: break;
-  }
-  return sse;
-}
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
deleted file mode 100644
index 6d9b5a12f..000000000
--- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
+++ /dev/null
@@ -1,222 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-
-SECTION .text
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(aom_ssim_parms_16x16_sse2) PRIVATE
-sym(aom_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(aom_ssim_parms_8x8_sse2) PRIVATE
-sym(aom_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
deleted file mode 100644
index 45bf6ec3c..000000000
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ /dev/null
@@ -1,1481 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times  8 dw  8
-bilin_filter_m_sse2: times  8 dw 16
-                     times  8 dw  0
-                     times  8 dw 14
-                     times  8 dw  2
-                     times  8 dw 12
-                     times  8 dw  4
-                     times  8 dw 10
-                     times  8 dw  6
-                     times 16 dw  8
-                     times  8 dw  6
-                     times  8 dw 10
-                     times  8 dw  4
-                     times  8 dw 12
-                     times  8 dw  2
-                     times  8 dw 14
-
-bilin_filter_m_ssse3: times  8 db 16,  0
-                      times  8 db 14,  2
-                      times  8 db 12,  4
-                      times  8 db 10,  6
-                      times 16 db  8
-                      times  8 db  6, 10
-                      times  8 db  4, 12
-                      times  8 db  2, 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
-;                               int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
-  psubw                %3, %4
-  psubw                %1, %2
-  paddw                %5, %3
-  pmaddwd              %3, %3
-  paddw                %5, %1
-  pmaddwd              %1, %1
-  paddd                %6, %3
-  paddd                %6, %1
-%endmacro
-
-%macro STORE_AND_RET 1
-%if %1 > 4
-  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
-  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
-  ; We have to sign-extend it before adding the words within the register
-  ; and outputing to a dword.
-  pcmpgtw              m5, m6           ; mask for 0 > x
-  movhlps              m3, m7
-  punpcklwd            m4, m6, m5
-  punpckhwd            m6, m5           ; sign-extend m6 word->dword
-  paddd                m7, m3
-  paddd                m6, m4
-  pshufd               m3, m7, 0x1
-  movhlps              m4, m6
-  paddd                m7, m3
-  paddd                m6, m4
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  pshufd               m4, m6, 0x1
-  movd               [r1], m7           ; store sse
-  paddd                m6, m4
-  movd               raxd, m6           ; store sum as return value
-%else ; 4xh
-  pshuflw              m4, m6, 0xe
-  pshuflw              m3, m7, 0xe
-  paddw                m6, m4
-  paddd                m7, m3
-  pcmpgtw              m5, m6           ; mask for 0 > x
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  punpcklwd            m6, m5           ; sign-extend m6 word->dword
-  movd               [r1], m7           ; store sse
-  pshuflw              m4, m6, 0xe
-  paddd                m6, m4
-  movd               raxd, m6           ; store sum as return value
-%endif
-  RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  add                srcq, src_stridemp
-%else
-  add                srcq, src_strideq
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%if cpuflag(ssse3)
-%define bilin_filter_m bilin_filter_m_ssse3
-%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-%endif
-; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
-; 11, not 13, if the registers are ordered correctly. May make a minor speed
-; difference on Win64
-
-%if ARCH_X86_64
-  %if %2 == 1 ; avg
-    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                        x_offset, y_offset, dst, dst_stride, \
-                                        sec, sec_stride, height, sse
-    %define sec_str sec_strideq
-  %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
-                                    x_offset, y_offset, dst, dst_stride, \
-                                    height, sse
-  %endif
-  %define block_height heightd
-  %define bilin_filter sseq
-%else
-  %if CONFIG_PIC=1
-    %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                          x_offset, y_offset, dst, dst_stride, \
-                                          sec, sec_stride, height, sse, \
-                                          g_bilin_filter, g_pw_8
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-
-      ;Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                      x_offset, y_offset, dst, dst_stride, \
-                                      height, sse, g_bilin_filter, g_pw_8
-      %define block_height heightd
-
-      ;Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %endif
-  %else
-    %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                          x_offset, y_offset, \
-                                          dst, dst_stride, sec, sec_stride, \
-                                          height, sse
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-    %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                      x_offset, y_offset, dst, dst_stride, \
-                                      height, sse
-      %define block_height heightd
-    %endif
-    %define bilin_filter bilin_filter_m
-  %endif
-%endif
-
-%if %1 == 4
-  %define movx movd
-%else
-  %define movx movh
-%endif
-
-  ASSERT               %1 <= 16         ; m6 overflows if w > 16
-  pxor                 m6, m6           ; sum
-  pxor                 m7, m7           ; sse
-  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
-  ; could perhaps use it for something more productive then
-  pxor                 m5, m5           ; dedicated zero register
-%if %1 < 16
-  sar                   block_height, 1
-%if %2 == 1 ; avg
-  shl             sec_str, 1
-%endif
-%endif
-
-  ; FIXME(rbultje) replace by jumptable?
-  test          x_offsetd, x_offsetd
-  jnz .x_nonzero
-  ; x_offset == 0
-  test          y_offsetd, y_offsetd
-  jnz .x_zero_y_nonzero
-
-  ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  mova                 m1, [dstq]
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%endif
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-
-%if %2 == 0 ; !avg
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m0, [srcq+src_strideq]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-%endif
-%else ; !avg
-  movx                 m2, [srcq+src_strideq]
-%endif
-
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-
-%if %2 == 1 ; avg
-%if %1 > 4
-  pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%if %1 > 4
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_zero_loop
-  STORE_AND_RET %1
-
-.x_zero_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_zero_y_nonhalf
-
-  ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-%endif
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m2, [srcq+src_strideq]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m2, [srcq+src_strideq*2]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq*2]
-  punpckldq            m2, m1
-%endif
-  movx                 m1, [dstq]
-%if %1 > 4
-  movlhps              m0, m2
-%else ; 4xh
-  punpckldq            m0, m2
-%endif
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpcklbw            m3, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m4, [secq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m4, [srcq+src_strideq*2]
-  movx                 m1, [dstq]
-  pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m2, m4
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_half_loop
-  STORE_AND_RET %1
-
-.x_zero_y_nonhalf:
-  ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  punpcklbw            m0, m5
-  punpcklbw            m4, m5
-  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
-  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
-  ; slightly faster because of pmullw latency. It would also cut our rodata
-  ; tables in half for this function, and save 1-2 registers on x86-64.
-  pmullw               m2, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m2, m3
-  paddw                m0, m4
-%endif
-  psraw                m2, 4
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m2, [srcq+src_strideq]
-  movx                 m4, [srcq+src_strideq*2]
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  movx                 m1, [dstq]
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m1, m2, filter_y_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, m1
-  paddw                m2, filter_rnd
-  movx                 m1, [dstq]
-  paddw                m2, m4
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonzero:
-  cmp           x_offsetd, 4
-  jne .x_nonhalf
-  ; x_offset == 0.5
-  test          y_offsetd, y_offsetd
-  jnz .x_half_y_nonzero
-
-  ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-%endif
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m4, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m0, [srcq+src_strideq]
-  movhps               m4, [srcq+src_strideq+1]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-  movx                 m2, [srcq+src_strideq+1]
-  punpckldq            m4, m2
-%endif
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m2, [srcq+src_strideq]
-  movx                 m1, [dstq]
-  pavgb                m0, m4
-  movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m2, m4
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_zero_loop
-  STORE_AND_RET %1
-
-.x_half_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_half_y_nonhalf
-
-  ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_half_loop:
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m4, m3
-  punpckhbw            m3, m1, m5
-  pavgb                m0, m4
-%if %2 == 1 ; avg
-  punpcklbw            m1, m5
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_half_loop:
-  movx                 m2, [srcq]
-  movx                 m3, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m2, [srcq+src_strideq]
-  movhps               m3, [srcq+src_strideq+1]
-%else
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m2, m1
-  movx                 m1, [srcq+src_strideq+1]
-  punpckldq            m3, m1
-%endif
-  pavgb                m2, m3
-%if %1 > 4
-  movlhps              m0, m2
-  movhlps              m4, m2
-%else ; 4xh
-  punpckldq            m0, m2
-  pshuflw              m4, m2, 0xe
-%endif
-  movx                 m1, [dstq]
-  pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
-%if %1 > 4
-  pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%if %1 > 4
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m4, [srcq+src_strideq]
-  movx                 m1, [srcq+src_strideq+1]
-  pavgb                m2, m3
-  pavgb                m4, m1
-  pavgb                m0, m2
-  pavgb                m2, m4
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_half_loop
-  STORE_AND_RET %1
-
-.x_half_y_nonhalf:
-  ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else  ;x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_other_loop:
-  movu                 m4, [srcq]
-  movu                 m2, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m4, m2
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  pmullw               m2, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, filter_rnd
-  punpcklbw            m0, m5
-  paddw                m2, m3
-  punpcklbw            m3, m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  paddw                m0, m3
-%endif
-  punpckhbw            m3, m1, m5
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-%if notcpuflag(ssse3)
-  punpcklbw            m0, m5
-%endif
-.x_half_y_other_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m4, [srcq+src_strideq]
-  movx                 m3, [srcq+src_strideq+1]
-  pavgb                m2, m1
-  pavgb                m4, m3
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  movx                 m1, [dstq]
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m1, m2, filter_y_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  paddw                m0, m1
-  pmullw               m1, m4, filter_y_b
-  paddw                m2, filter_rnd
-  paddw                m2, m1
-  movx                 m1, [dstq]
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf:
-  test          y_offsetd, y_offsetd
-  jnz .x_nonhalf_y_nonzero
-
-  ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-;y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  punpcklbw            m0, m5
-  punpcklbw            m4, m5
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m2, m3
-  paddw                m0, m4
-%endif
-  psraw                m2, 4
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m2, [srcq+src_strideq]
-  movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  movx                 m1, [dstq]
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_x_a
-  pmaddubsw            m2, filter_x_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m0, m1
-  paddw                m2, filter_rnd
-  movx                 m1, [dstq]
-  paddw                m2, m4
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_nonhalf_y_nonhalf
-
-  ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m1
-  punpcklbw            m0, m1
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m0, m1
-  paddw                m2, m3
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-  add                srcq, src_strideq
-  packuswb             m0, m2
-.x_other_y_half_loop:
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-%if cpuflag(ssse3)
-  mova                 m1, [dstq]
-  punpckhbw            m2, m4, m3
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m4, m2
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%else
-  punpckhbw            m2, m4, m5
-  punpckhbw            m1, m3, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m4, m3
-  paddw                m2, m1
-  mova                 m1, [dstq]
-  psraw                m4, 4
-  psraw                m2, 4
-  punpckhbw            m3, m1, m5
-  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
-  ; have a 1-register shortage to be able to store the backup of the bilin
-  ; filtered second line as words as cache for the next line. Packing into
-  ; a byte costs 1 pack and 2 unpacks, but saves a register.
-  packuswb             m4, m2
-  punpcklbw            m1, m5
-  pavgb                m0, m4
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  pavgb                m0, [secq]
-%endif
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  pmaddubsw            m0, filter_x_a
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m1
-%endif
-  add                srcq, src_strideq
-  psraw                m0, 4
-.x_other_y_half_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m4, [srcq+src_strideq]
-  movx                 m3, [srcq+src_strideq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m2, m1
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m1, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  paddw                m2, m1
-  movx                 m1, [dstq]
-  paddw                m4, m3
-  movx                 m3, [dstq+dst_strideq]
-%endif
-  psraw                m2, 4
-  psraw                m4, 4
-  pavgw                m0, m2
-  pavgw                m2, m4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf_y_nonhalf:
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                m11, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register. Used src_stride register. Later,
-; src_stride has to be loaded from stack when it is needed.
-%define tempq src_strideq
-  mov tempq, g_bilin_filterm
-  add           x_offsetq, tempq
-  add           y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-  add           y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-  ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m1
-  punpcklbw            m0, m1
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m0, m1
-  paddw                m2, m3
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-  packuswb             m0, m2
-.x_other_y_other_loop:
-%if cpuflag(ssse3)
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
-  punpckhbw            m2, m4, m3
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  punpckhbw            m3, m1, m5
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m4, m2
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  punpcklbw            m1, m5
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  psraw                m0, 4
-%else
-  movu                 m3, [srcq]
-  movu                 m4, [srcq+1]
-  punpckhbw            m1, m3, m5
-  punpckhbw            m2, m4, m5
-  punpcklbw            m3, m5
-  punpcklbw            m4, m5
-  pmullw               m3, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m3, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m3, m4
-  paddw                m1, m2
-  psraw                m3, 4
-  psraw                m1, 4
-  packuswb             m4, m3, m1
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  pmullw               m2, filter_y_a
-  pmullw               m1, filter_y_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, m1
-  mova                 m1, [dstq]
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  paddw                m0, m3
-  punpckhbw            m3, m1, m5
-  psraw                m0, 4
-  punpcklbw            m1, m5
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  INC_SRC_BY_SRC_STRIDE
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  pmaddubsw            m0, filter_x_a
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m1
-%endif
-  psraw                m0, 4
-%if cpuflag(ssse3)
-  packuswb             m0, m0
-%endif
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-
-  INC_SRC_BY_SRC_STRIDE
-  movx                 m4, [srcq]
-  movx                 m3, [srcq+1]
-
-%if cpuflag(ssse3)
-  punpcklbw            m2, m1
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  movx                 m3, [dstq+dst_strideq]
-  movx                 m1, [dstq]
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m2, m2
-  packuswb             m4, m4
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-  psraw                m0, 4
-  psraw                m2, 4
-  punpcklbw            m1, m5
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m1, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  paddw                m2, m1
-  paddw                m4, m3
-  psraw                m2, 4
-  psraw                m4, 4
-  pmullw               m0, filter_y_a
-  pmullw               m3, m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  pmullw               m1, m4, filter_y_b
-  paddw                m2, filter_rnd
-  paddw                m0, m3
-  movx                 m3, [dstq+dst_strideq]
-  paddw                m2, m1
-  movx                 m1, [dstq]
-  psraw                m0, 4
-  psraw                m2, 4
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-%undef movx
-  STORE_AND_RET %1
-%endmacro
-
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
-; between the ssse3 and non-ssse3 version. It may make sense to merge their
-; code in the sense that the ssse3 version would jump to the appropriate
-; location in the sse/2 version, rather than duplicating that code in the
-; binary.
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
deleted file mode 100644
index 4389d123d..000000000
--- a/third_party/aom/aom_dsp/x86/subtract_avx2.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
-                                   const uint8_t *pred_ptr) {
-  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
-  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
-  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
-  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
-  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
-  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
-  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
-  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
-  _mm256_store_si256((__m256i *)(diff_ptr), d_0);
-  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
-}
-
-static INLINE void aom_subtract_block_16xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
-    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
-    __m256i s_0 = _mm256_cvtepu8_epi16(s);
-    __m256i p_0 = _mm256_cvtepu8_epi16(p);
-    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
-    _mm256_store_si256((__m256i *)(diff_ptr), d_0);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void aom_subtract_block_32xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void aom_subtract_block_64xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void aom_subtract_block_128xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
-    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
-    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
-                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
-                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                             ptrdiff_t pred_stride) {
-  switch (cols) {
-    case 16:
-      aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
-      break;
-    case 32:
-      aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
-      break;
-    case 64:
-      aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
-      break;
-    case 128:
-      aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                    src_stride, pred_ptr, pred_stride);
-      break;
-    default:
-      aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
-                              src_stride, pred_ptr, pred_stride);
-      break;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
deleted file mode 100644
index 1a75a234f..000000000
--- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,146 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; void aom_subtract_block(int rows, int cols,
-;                         int16_t *diff, ptrdiff_t diff_stride,
-;                         const uint8_t *src, ptrdiff_t src_stride,
-;                         const uint8_t *pred, ptrdiff_t pred_stride)
-
-INIT_XMM sse2
-cglobal subtract_block, 7, 7, 8, \
-                        rows, cols, diff, diff_stride, src, src_stride, \
-                        pred, pred_stride
-%define pred_str colsq
-  pxor                  m7, m7         ; dedicated zero register
-  cmp                colsd, 4
-  je .case_4
-  cmp                colsd, 8
-  je .case_8
-  cmp                colsd, 16
-  je .case_16
-  cmp                colsd, 32
-  je .case_32
-  cmp                colsd, 64
-  je .case_64
-
-%macro loop16 6
-  mova                  m0, [srcq+%1]
-  mova                  m4, [srcq+%2]
-  mova                  m1, [predq+%3]
-  mova                  m5, [predq+%4]
-  punpckhbw             m2, m0, m7
-  punpckhbw             m3, m1, m7
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  psubw                 m2, m3
-  psubw                 m0, m1
-  punpckhbw             m1, m4, m7
-  punpckhbw             m3, m5, m7
-  punpcklbw             m4, m7
-  punpcklbw             m5, m7
-  psubw                 m1, m3
-  psubw                 m4, m5
-  mova [diffq+mmsize*0+%5], m0
-  mova [diffq+mmsize*1+%5], m2
-  mova [diffq+mmsize*0+%6], m4
-  mova [diffq+mmsize*1+%6], m1
-%endmacro
-
-  mov             pred_str, pred_stridemp
-.loop_128:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
-  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
-  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  sub                rowsd, 1
-  jnz .loop_128
-  RET
-
-.case_64:
-  mov             pred_str, pred_stridemp
-.loop_64:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_64
-  RET
-
-.case_32:
-  mov             pred_str, pred_stridemp
-.loop_32:
-  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_32
-  RET
-
-.case_16:
-  mov             pred_str, pred_stridemp
-.loop_16:
-  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                predq, [predq+pred_str*2]
-  lea                 srcq, [srcq+src_strideq*2]
-  sub                rowsd, 2
-  jg .loop_16
-  RET
-
-%macro loop_h 0
-  movh                  m0, [srcq]
-  movh                  m2, [srcq+src_strideq]
-  movh                  m1, [predq]
-  movh                  m3, [predq+pred_str]
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  punpcklbw             m2, m7
-  punpcklbw             m3, m7
-  psubw                 m0, m1
-  psubw                 m2, m3
-  mova             [diffq], m0
-  mova [diffq+diff_strideq*2], m2
-%endmacro
-
-.case_8:
-  mov             pred_str, pred_stridemp
-.loop_8:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_8
-  RET
-
-INIT_MMX
-.case_4:
-  mov             pred_str, pred_stridemp
-.loop_4:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_4
-  RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
deleted file mode 100644
index 0af44e3a4..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_dsp/x86/sum_squares_sse2.h"
-#include "config/aom_dsp_rtcd.h"
-
-static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
-                                                int width, int height) {
-  uint64_t result;
-  __m256i v_acc_q = _mm256_setzero_si256();
-  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
-  for (int col = 0; col < height; col += 4) {
-    __m256i v_acc_d = _mm256_setzero_si256();
-    for (int row = 0; row < width; row += 16) {
-      const int16_t *tempsrc = src + row;
-      const __m256i v_val_0_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
-      const __m256i v_val_1_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
-      const __m256i v_val_2_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
-      const __m256i v_val_3_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
-
-      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
-      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
-      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
-      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
-
-      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
-      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
-      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
-
-      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
-    }
-    v_acc_q =
-        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
-    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
-    src += 4 * stride;
-  }
-  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
-  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
-  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
-
-  result_64_2_int = _mm_add_epi64(
-      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
-
-  xx_storel_64(&result, result_64_2_int);
-
-  return result;
-}
-
-uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
-                                     int height) {
-  if (LIKELY(width == 4 && height == 4)) {
-    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
-    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
-  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
-    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
-  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
-    return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
-  } else {
-    return aom_sum_squares_2d_i16_c(src, stride, width, height);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
deleted file mode 100644
index 22d7739ec..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <stdio.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/sum_squares_sse2.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
-  const __m128d ad = _mm_castsi128_pd(a);
-  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
-}
-
-static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(a);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, a);
-    return tmp;
-  }
-#endif
-}
-
-static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
-  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
-  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
-  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
-  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
-  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
-  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
-
-  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
-}
-
-uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
-  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
-  __m128i v_sum_d =
-      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
-  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-}
-
-uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
-                                         int height) {
-  int r = 0;
-  __m128i v_acc_q = _mm_setzero_si128();
-  do {
-    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
-    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
-    src += stride << 2;
-    r += 4;
-  } while (r < height);
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
-  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
-                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
-  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
-  return xx_cvtsi128_si64(v_acc_64);
-}
-
-#ifdef __GNUC__
-// This prevents GCC/Clang from inlining this function into
-// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
-// maintenance instructions in the common case of 4x4.
-__attribute__((noinline))
-#endif
-uint64_t
-aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
-                                int height) {
-  int r = 0;
-
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
-  __m128i v_acc_q = _mm_setzero_si128();
-
-  do {
-    __m128i v_acc_d = _mm_setzero_si128();
-    int c = 0;
-    do {
-      const int16_t *b = src + c;
-
-      const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
-      const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
-      const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
-      const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
-
-      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
-      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-
-      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
-      c += 8;
-    } while (c < width);
-
-    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
-    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
-
-    src += 4 * stride;
-    r += 4;
-  } while (r < height);
-
-  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-  return xx_cvtsi128_si64(v_acc_q);
-}
-
-uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
-                                     int height) {
-  // 4 elements per row only requires half an XMM register, so this
-  // must be a special case, but also note that over 75% of all calls
-  // are with size == 4, so it is also the common case.
-  if (LIKELY(width == 4 && height == 4)) {
-    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
-    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
-  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
-    // Generic case
-    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
-  } else {
-    return aom_sum_squares_2d_i16_c(src, stride, width, height);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// 1D version
-//////////////////////////////////////////////////////////////////////////////
-
-static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
-  __m128i v_acc0_q = _mm_setzero_si128();
-  __m128i v_acc1_q = _mm_setzero_si128();
-
-  const int16_t *const end = src + n;
-
-  assert(n % 64 == 0);
-
-  while (src < end) {
-    const __m128i v_val_0_w = xx_load_128(src);
-    const __m128i v_val_1_w = xx_load_128(src + 8);
-    const __m128i v_val_2_w = xx_load_128(src + 16);
-    const __m128i v_val_3_w = xx_load_128(src + 24);
-    const __m128i v_val_4_w = xx_load_128(src + 32);
-    const __m128i v_val_5_w = xx_load_128(src + 40);
-    const __m128i v_val_6_w = xx_load_128(src + 48);
-    const __m128i v_val_7_w = xx_load_128(src + 56);
-
-    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
-    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
-    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
-    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
-
-    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
-    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
-
-    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
-
-    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
-
-    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
-    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
-
-    src += 64;
-  }
-
-  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
-  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
-  return xx_cvtsi128_si64(v_acc0_q);
-}
-
-uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
-  if (n % 64 == 0) {
-    return aom_sum_squares_i16_64n_sse2(src, n);
-  } else if (n > 64) {
-    int k = n & ~(64 - 1);
-    return aom_sum_squares_i16_64n_sse2(src, k) +
-           aom_sum_squares_i16_c(src + k, n - k);
-  } else {
-    return aom_sum_squares_i16_c(src, n);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
deleted file mode 100644
index 491e31cc5..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
-#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
-
-uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
-                                         int width, int height);
-
-uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
-                                         int height);
-uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
-
-#endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
deleted file mode 100644
index 1e9f1e27b..000000000
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
-#define AOM_AOM_DSP_X86_SYNONYMS_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-/**
- * Various reusable shorthands for x86 SIMD intrinsics.
- *
- * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
- * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
- */
-
-// Loads and stores to do away with the tedium of casting the address
-// to the right type.
-static INLINE __m128i xx_loadl_32(const void *a) {
-  return _mm_cvtsi32_si128(*(const uint32_t *)a);
-}
-
-static INLINE __m128i xx_loadl_64(const void *a) {
-  return _mm_loadl_epi64((const __m128i *)a);
-}
-
-static INLINE __m128i xx_load_128(const void *a) {
-  return _mm_load_si128((const __m128i *)a);
-}
-
-static INLINE __m128i xx_loadu_128(const void *a) {
-  return _mm_loadu_si128((const __m128i *)a);
-}
-
-static INLINE void xx_storel_32(void *const a, const __m128i v) {
-  *(uint32_t *)a = _mm_cvtsi128_si32(v);
-}
-
-static INLINE void xx_storel_64(void *const a, const __m128i v) {
-  _mm_storel_epi64((__m128i *)a, v);
-}
-
-static INLINE void xx_store_128(void *const a, const __m128i v) {
-  _mm_store_si128((__m128i *)a, v);
-}
-
-static INLINE void xx_storeu_128(void *const a, const __m128i v) {
-  _mm_storeu_si128((__m128i *)a, v);
-}
-
-// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm_set_epi64x()
-// acting on 32-bit integers.
-static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
-#if defined(_MSC_VER) && _MSC_VER < 1900
-  return _mm_set_epi32(0, e1, 0, e0);
-#else
-  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
-#endif
-}
-
-// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm_set1_epi64x()
-// acting on a 32-bit integer.
-static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
-#if defined(_MSC_VER) && _MSC_VER < 1900
-  return _mm_set_epi32(0, a, 0, a);
-#else
-  return _mm_set1_epi64x((uint32_t)a);
-#endif
-}
-
-static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
-  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
-  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
-  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
-  return _mm_srli_epi32(v_tmp_d, bits);
-}
-
-// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
-  return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
-  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
-  const __m128i v_tmp_d =
-      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
-  return _mm_srai_epi16(v_tmp_d, bits);
-}
-
-#endif  // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
deleted file mode 100644
index 3f69b120e..000000000
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
-#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-/**
- * Various reusable shorthands for x86 SIMD intrinsics.
- *
- * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
- * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
- */
-
-// Loads and stores to do away with the tedium of casting the address
-// to the right type.
-static INLINE __m256i yy_load_256(const void *a) {
-  return _mm256_load_si256((const __m256i *)a);
-}
-
-static INLINE __m256i yy_loadu_256(const void *a) {
-  return _mm256_loadu_si256((const __m256i *)a);
-}
-
-static INLINE void yy_store_256(void *const a, const __m256i v) {
-  _mm256_store_si256((__m256i *)a, v);
-}
-
-static INLINE void yy_storeu_256(void *const a, const __m256i v) {
-  _mm256_storeu_si256((__m256i *)a, v);
-}
-
-// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm256_set1_epi64x()
-// acting on a 32-bit integer.
-static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
-#else
-  return _mm256_set1_epi64x((uint32_t)a);
-#endif
-}
-
-// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
-// therefore define an equivalent function using a different intrinsic.
-// ([ hi ], [ lo ]) -> [ hi ][ lo ]
-static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-}
-
-static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
-  __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
-  __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
-  return yy_set_m128i(mhi, mlo);
-}
-
-static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
-  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
-  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
-}
-#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
deleted file mode 100644
index d0d1ee684..000000000
--- a/third_party/aom/aom_dsp/x86/transpose_sse2.h
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
-#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-
-static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 10 11 12 13
-  // in[2]: 20 21 22 23
-  // in[3]: 30 31 32 33
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
-
-  // Unpack 32 bit elements resulting in:
-  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
-  return _mm_unpacklo_epi16(a0, a1);
-}
-
-static INLINE void transpose_8bit_8x8(const __m128i *const in,
-                                      __m128i *const out) {
-  // Unpack 8 bit elements. Goes from:
-  // in[0]: 00 01 02 03 04 05 06 07
-  // in[1]: 10 11 12 13 14 15 16 17
-  // in[2]: 20 21 22 23 24 25 26 27
-  // in[3]: 30 31 32 33 34 35 36 37
-  // in[4]: 40 41 42 43 44 45 46 47
-  // in[5]: 50 51 52 53 54 55 56 57
-  // in[6]: 60 61 62 63 64 65 66 67
-  // in[7]: 70 71 72 73 74 75 76 77
-  // to:
-  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
-  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
-  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
-  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
-  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
-  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
-  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
-
-  // Unpack 16 bit elements resulting in:
-  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
-  // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
-  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
-  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
-  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
-  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
-
-  // Unpack 32 bit elements resulting in:
-  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
-  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
-  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
-  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
-  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30 40 50 60 70
-  // out[1]: 01 11 21 31 41 51 61 71
-  // out[2]: 02 12 22 32 42 52 62 72
-  // out[3]: 03 13 23 33 43 53 63 73
-  // out[4]: 04 14 24 34 44 54 64 74
-  // out[5]: 05 15 25 35 45 55 65 75
-  // out[6]: 06 16 26 36 46 56 66 76
-  // out[7]: 07 17 27 37 47 57 67 77
-  out[0] = _mm_unpacklo_epi64(c0, c0);
-  out[1] = _mm_unpackhi_epi64(c0, c0);
-  out[2] = _mm_unpacklo_epi64(c1, c1);
-  out[3] = _mm_unpackhi_epi64(c1, c1);
-  out[4] = _mm_unpacklo_epi64(c2, c2);
-  out[5] = _mm_unpackhi_epi64(c2, c2);
-  out[6] = _mm_unpacklo_epi64(c3, c3);
-  out[7] = _mm_unpackhi_epi64(c3, c3);
-}
-
-static INLINE void transpose_16bit_4x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  XX XX XX XX
-  // in[1]: 10 11 12 13  XX XX XX XX
-  // in[2]: 20 21 22 23  XX XX XX XX
-  // in[3]: 30 31 32 33  XX XX XX XX
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-
-  // Unpack 32 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  out[0] = _mm_unpacklo_epi32(a0, a1);
-  out[1] = _mm_srli_si128(out[0], 8);
-  out[2] = _mm_unpackhi_epi32(a0, a1);
-  out[3] = _mm_srli_si128(out[2], 8);
-}
-
-static INLINE void transpose_16bit_4x8(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  XX XX XX XX
-  // in[1]: 10 11 12 13  XX XX XX XX
-  // in[2]: 20 21 22 23  XX XX XX XX
-  // in[3]: 30 31 32 33  XX XX XX XX
-  // in[4]: 40 41 42 43  XX XX XX XX
-  // in[5]: 50 51 52 53  XX XX XX XX
-  // in[6]: 60 61 62 63  XX XX XX XX
-  // in[7]: 70 71 72 73  XX XX XX XX
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  // a2:    40 50 41 51  42 52 43 53
-  // a3:    60 70 61 71  62 72 63 73
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  // Unpack 32 bit elements resulting in:
-  // b0: 00 10 20 30  01 11 21 31
-  // b1: 40 50 60 70  41 51 61 71
-  // b2: 02 12 22 32  03 13 23 33
-  // b3: 42 52 62 72  43 53 63 73
-  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
-  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
-  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30  40 50 60 70
-  // out[1]: 01 11 21 31  41 51 61 71
-  // out[2]: 02 12 22 32  42 52 62 72
-  // out[3]: 03 13 23 33  43 53 63 73
-  out[0] = _mm_unpacklo_epi64(b0, b1);
-  out[1] = _mm_unpackhi_epi64(b0, b1);
-  out[2] = _mm_unpacklo_epi64(b2, b3);
-  out[3] = _mm_unpackhi_epi64(b2, b3);
-}
-
-static INLINE void transpose_16bit_8x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  04 05 06 07
-  // in[1]: 10 11 12 13  14 15 16 17
-  // in[2]: 20 21 22 23  24 25 26 27
-  // in[3]: 30 31 32 33  34 35 36 37
-
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  // a4:    04 14 05 15  06 16 07 17
-  // a5:    24 34 25 35  26 36 27 37
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
-
-  // Unpack 32 bit elements resulting in:
-  // b0: 00 10 20 30  01 11 21 31
-  // b2: 04 14 24 34  05 15 25 35
-  // b4: 02 12 22 32  03 13 23 33
-  // b6: 06 16 26 36  07 17 27 37
-  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
-  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
-  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30  XX XX XX XX
-  // out[1]: 01 11 21 31  XX XX XX XX
-  // out[2]: 02 12 22 32  XX XX XX XX
-  // out[3]: 03 13 23 33  XX XX XX XX
-  // out[4]: 04 14 24 34  XX XX XX XX
-  // out[5]: 05 15 25 35  XX XX XX XX
-  // out[6]: 06 16 26 36  XX XX XX XX
-  // out[7]: 07 17 27 37  XX XX XX XX
-  const __m128i zeros = _mm_setzero_si128();
-  out[0] = _mm_unpacklo_epi64(b0, zeros);
-  out[1] = _mm_unpackhi_epi64(b0, zeros);
-  out[2] = _mm_unpacklo_epi64(b4, zeros);
-  out[3] = _mm_unpackhi_epi64(b4, zeros);
-  out[4] = _mm_unpacklo_epi64(b2, zeros);
-  out[5] = _mm_unpackhi_epi64(b2, zeros);
-  out[6] = _mm_unpacklo_epi64(b6, zeros);
-  out[7] = _mm_unpackhi_epi64(b6, zeros);
-}
-
-static INLINE void transpose_16bit_8x8(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  04 05 06 07
-  // in[1]: 10 11 12 13  14 15 16 17
-  // in[2]: 20 21 22 23  24 25 26 27
-  // in[3]: 30 31 32 33  34 35 36 37
-  // in[4]: 40 41 42 43  44 45 46 47
-  // in[5]: 50 51 52 53  54 55 56 57
-  // in[6]: 60 61 62 63  64 65 66 67
-  // in[7]: 70 71 72 73  74 75 76 77
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  // a2:    40 50 41 51  42 52 43 53
-  // a3:    60 70 61 71  62 72 63 73
-  // a4:    04 14 05 15  06 16 07 17
-  // a5:    24 34 25 35  26 36 27 37
-  // a6:    44 54 45 55  46 56 47 57
-  // a7:    64 74 65 75  66 76 67 77
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  // Unpack 32 bit elements resulting in:
-  // b0: 00 10 20 30  01 11 21 31
-  // b1: 40 50 60 70  41 51 61 71
-  // b2: 04 14 24 34  05 15 25 35
-  // b3: 44 54 64 74  45 55 65 75
-  // b4: 02 12 22 32  03 13 23 33
-  // b5: 42 52 62 72  43 53 63 73
-  // b6: 06 16 26 36  07 17 27 37
-  // b7: 46 56 66 76  47 57 67 77
-  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
-  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
-  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
-  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
-  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
-  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
-  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30  40 50 60 70
-  // out[1]: 01 11 21 31  41 51 61 71
-  // out[2]: 02 12 22 32  42 52 62 72
-  // out[3]: 03 13 23 33  43 53 63 73
-  // out[4]: 04 14 24 34  44 54 64 74
-  // out[5]: 05 15 25 35  45 55 65 75
-  // out[6]: 06 16 26 36  46 56 66 76
-  // out[7]: 07 17 27 37  47 57 67 77
-  out[0] = _mm_unpacklo_epi64(b0, b1);
-  out[1] = _mm_unpackhi_epi64(b0, b1);
-  out[2] = _mm_unpacklo_epi64(b4, b5);
-  out[3] = _mm_unpackhi_epi64(b4, b5);
-  out[4] = _mm_unpacklo_epi64(b2, b3);
-  out[5] = _mm_unpackhi_epi64(b2, b3);
-  out[6] = _mm_unpacklo_epi64(b6, b7);
-  out[7] = _mm_unpackhi_epi64(b6, b7);
-}
-
-// Transpose in-place
-static INLINE void transpose_16bit_16x16(__m128i *const left,
-                                         __m128i *const right) {
-  __m128i tbuf[8];
-  transpose_16bit_8x8(left, left);
-  transpose_16bit_8x8(right, tbuf);
-  transpose_16bit_8x8(left + 8, right);
-  transpose_16bit_8x8(right + 8, right + 8);
-
-  left[8] = tbuf[0];
-  left[9] = tbuf[1];
-  left[10] = tbuf[2];
-  left[11] = tbuf[3];
-  left[12] = tbuf[4];
-  left[13] = tbuf[5];
-  left[14] = tbuf[6];
-  left[15] = tbuf[7];
-}
-
-static INLINE void transpose_32bit_4x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 32 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 10 11 12 13
-  // in[2]: 20 21 22 23
-  // in[3]: 30 31 32 33
-  // to:
-  // a0:    00 10 01 11
-  // a1:    20 30 21 31
-  // a2:    02 12 03 13
-  // a3:    22 32 23 33
-
-  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
-  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
-  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  out[0] = _mm_unpacklo_epi64(a0, a1);
-  out[1] = _mm_unpackhi_epi64(a0, a1);
-  out[2] = _mm_unpacklo_epi64(a2, a3);
-  out[3] = _mm_unpackhi_epi64(a2, a3);
-}
-
-static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
-                                         __m128i *const out) {
-  // Unpack 32 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 10 11 12 13
-  // in[2]: 20 21 22 23
-  // in[3]: 30 31 32 33
-  // in[4]: 04 05 06 07
-  // in[5]: 14 15 16 17
-  // in[6]: 24 25 26 27
-  // in[7]: 34 35 36 37
-  // to:
-  // a0:    00 10 01 11
-  // a1:    20 30 21 31
-  // a2:    02 12 03 13
-  // a3:    22 32 23 33
-  // a4:    04 14 05 15
-  // a5:    24 34 25 35
-  // a6:    06 16 07 17
-  // a7:    26 36 27 37
-  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
-  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
-  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
-  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
-  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
-  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
-  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  // out[4]: 04 14 24 34
-  // out[5]: 05 15 25 35
-  // out[6]: 06 16 26 36
-  // out[7]: 07 17 27 37
-  out[0] = _mm_unpacklo_epi64(a0, a1);
-  out[1] = _mm_unpackhi_epi64(a0, a1);
-  out[2] = _mm_unpacklo_epi64(a2, a3);
-  out[3] = _mm_unpackhi_epi64(a2, a3);
-  out[4] = _mm_unpacklo_epi64(a4, a5);
-  out[5] = _mm_unpackhi_epi64(a4, a5);
-  out[6] = _mm_unpacklo_epi64(a6, a7);
-  out[7] = _mm_unpackhi_epi64(a6, a7);
-}
-
-static INLINE void transpose_32bit_8x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 32 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 04 05 06 07
-  // in[2]: 10 11 12 13
-  // in[3]: 14 15 16 17
-  // in[4]: 20 21 22 23
-  // in[5]: 24 25 26 27
-  // in[6]: 30 31 32 33
-  // in[7]: 34 35 36 37
-  // to:
-  // a0: 00 10 01 11
-  // a1: 20 30 21 31
-  // a2: 02 12 03 13
-  // a3: 22 32 23 33
-  // a4: 04 14 05 15
-  // a5: 24 34 25 35
-  // a6: 06 16 07 17
-  // a7: 26 36 27 37
-  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
-  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
-  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
-  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
-  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
-  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
-  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
-  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  // out[4]: 04 14 24 34
-  // out[5]: 05 15 25 35
-  // out[6]: 06 16 26 36
-  // out[7]: 07 17 27 37
-  out[0] = _mm_unpacklo_epi64(a0, a1);
-  out[1] = _mm_unpackhi_epi64(a0, a1);
-  out[2] = _mm_unpacklo_epi64(a2, a3);
-  out[3] = _mm_unpackhi_epi64(a2, a3);
-  out[4] = _mm_unpacklo_epi64(a4, a5);
-  out[5] = _mm_unpackhi_epi64(a4, a5);
-  out[6] = _mm_unpacklo_epi64(a6, a7);
-  out[7] = _mm_unpackhi_epi64(a6, a7);
-}
-
-#endif  // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
deleted file mode 100644
index b1611ba87..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-
-#include <emmintrin.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit);
-
-static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
-  return _mm256_set1_epi32(
-      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
-}
-
-static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
-                                   __m256i *in0, __m256i *in1, const __m256i _r,
-                                   const int32_t cos_bit) {
-  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
-  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
-  __m256i u0 = _mm256_madd_epi16(t0, w0);
-  __m256i u1 = _mm256_madd_epi16(t1, w0);
-  __m256i v0 = _mm256_madd_epi16(t0, w1);
-  __m256i v1 = _mm256_madd_epi16(t1, w1);
-
-  __m256i a0 = _mm256_add_epi32(u0, _r);
-  __m256i a1 = _mm256_add_epi32(u1, _r);
-  __m256i b0 = _mm256_add_epi32(v0, _r);
-  __m256i b1 = _mm256_add_epi32(v1, _r);
-
-  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
-  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
-  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
-  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
-
-  *in0 = _mm256_packs_epi32(c0, c1);
-  *in1 = _mm256_packs_epi32(d0, d1);
-}
-
-static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
-  const __m256i _in0 = *in0;
-  const __m256i _in1 = *in1;
-  *in0 = _mm256_adds_epi16(_in0, _in1);
-  *in1 = _mm256_subs_epi16(_in0, _in1);
-}
-
-static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
-  const __m256i _in0 = *in0;
-  const __m256i _in1 = *in1;
-  *in0 = _mm256_add_epi32(_in0, _in1);
-  *in1 = _mm256_sub_epi32(_in0, _in1);
-}
-
-static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
-                                             __m256i in0, __m256i in1) {
-  const __m256i _in0 = in0;
-  const __m256i _in1 = in1;
-  *out0 = _mm256_adds_epi16(_in0, _in1);
-  *out1 = _mm256_subs_epi16(_in0, _in1);
-}
-
-static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
-                                           __m256i in0, __m256i in1) {
-  const __m256i _in0 = in0;
-  const __m256i _in1 = in1;
-  *out0 = _mm256_add_epi32(_in0, _in1);
-  *out1 = _mm256_sub_epi32(_in0, _in1);
-}
-
-static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
-  return _mm256_load_si256((const __m256i *)a);
-}
-
-static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
-                                                   int stride, __m256i *out,
-                                                   int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[i] = load_16bit_to_16bit_avx2(in + i * stride);
-  }
-}
-
-static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
-                                                        int stride,
-                                                        __m256i *out,
-                                                        int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
-  }
-}
-
-static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
-  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
-  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
-  return _mm256_permute4x64_epi64(b, 0xD8);
-}
-
-static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
-                                                       int stride, __m256i *out,
-                                                       int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
-  }
-}
-
-static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
-                                              __m256i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  08 09 0a 0b  04 05 06 07  0c 0d 0e 0f
-  // in[1]: 10 11 12 13  18 19 1a 1b  14 15 16 17  1c 1d 1e 1f
-  // in[2]: 20 21 22 23  28 29 2a 2b  24 25 26 27  2c 2d 2e 2f
-  // in[3]: 30 31 32 33  38 39 3a 3b  34 35 36 37  3c 3d 3e 3f
-  // in[4]: 40 41 42 43  48 49 4a 4b  44 45 46 47  4c 4d 4e 4f
-  // in[5]: 50 51 52 53  58 59 5a 5b  54 55 56 57  5c 5d 5e 5f
-  // in[6]: 60 61 62 63  68 69 6a 6b  64 65 66 67  6c 6d 6e 6f
-  // in[7]: 70 71 72 73  78 79 7a 7b  74 75 76 77  7c 7d 7e 7f
-  // in[8]: 80 81 82 83  88 89 8a 8b  84 85 86 87  8c 8d 8e 8f
-  // to:
-  // a0:    00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
-  // a1:    20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
-  // a2:    40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
-  // a3:    60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
-  // ...
-  __m256i a[16];
-  for (int i = 0; i < 16; i += 2) {
-    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
-    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
-  }
-  __m256i b[16];
-  for (int i = 0; i < 16; i += 2) {
-    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
-    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
-  }
-  __m256i c[16];
-  for (int i = 0; i < 16; i += 2) {
-    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
-    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
-  }
-  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
-  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
-  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
-  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
-
-  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
-  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
-  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
-  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
-
-  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
-  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
-  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
-  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
-
-  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
-  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
-  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
-  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
-}
-
-static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
-  for (int i = 0; i < size; ++i) {
-    out[size - i - 1] = in[i];
-  }
-}
-
-static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
-  if (bit < 0) {
-    bit = -bit;
-    __m256i round = _mm256_set1_epi16(1 << (bit - 1));
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm256_adds_epi16(in[i], round);
-      in[i] = _mm256_srai_epi16(in[i], bit);
-    }
-  } else if (bit > 0) {
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm256_slli_epi16(in[i], bit);
-    }
-  }
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
deleted file mode 100644
index ed82eee96..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-
-#include <emmintrin.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#define pair_set_epi16(a, b) \
-  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
-
-// Reverse the 8 16 bit words in __m128i
-static INLINE __m128i mm_reverse_epi16(const __m128i x) {
-  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
-  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
-  return _mm_shuffle_epi32(b, 0x4e);
-}
-
-#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
deleted file mode 100644
index 800aef126..000000000
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
-
-static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
-  return _mm_add_epi16(_mm256_castsi256_si128(val),
-                       _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
-  return _mm_add_epi32(_mm256_castsi256_si128(val),
-                       _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
-                                        __m256i *const sse,
-                                        __m256i *const sum) {
-  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
-
-  // unpack into pairs of source and reference values
-  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
-  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
-
-  // subtract adjacent elements using src*1 + ref*-1
-  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
-  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
-  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
-  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
-
-  // add to the running totals
-  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
-  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
-}
-
-static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
-                                                     unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
-
-  // unpack sse and sum registers and add
-  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
-  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
-  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
-
-  // perform the final summation and extract the results
-  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
-  *((int *)sse) = _mm_cvtsi128_si32(res);
-  return _mm_extract_epi32(res, 1);
-}
-
-// handle pixels (<= 512)
-static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
-                                          unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
-  const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
-  const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
-  return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
-}
-
-// handle 1024 pixels (32x32, 16x64, 64x16)
-static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
-                                           unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
-  const __m128i vsum_64 =
-      _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
-                    _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
-  return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
-}
-
-static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
-  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
-  const __m256i sum_hi =
-      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
-  return _mm256_add_epi32(sum_lo, sum_hi);
-}
-
-// handle 2048 pixels (32x64, 64x32)
-static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
-                                           unsigned int *const sse) {
-  vsum = sum_to_32bit_avx2(vsum);
-  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
-  return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
-}
-
-static INLINE void variance16_kernel_avx2(
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    const int ref_stride, __m256i *const sse, __m256i *const sum) {
-  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
-  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
-  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
-  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
-  variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance32_kernel_avx2(const uint8_t *const src,
-                                          const uint8_t *const ref,
-                                          __m256i *const sse,
-                                          __m256i *const sum) {
-  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
-  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
-  variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i += 2) {
-    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-  }
-}
-
-static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src, ref, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
-    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
-                                    const uint8_t *ref, const int ref_stride,
-                                    const int h, __m256i *const vsse,
-                                    __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
-    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
-    variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
-    variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
-  unsigned int aom_variance##bw##x##bh##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m256i vsse = _mm256_setzero_si256();                                    \
-    __m256i vsum;                                                             \
-    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
-    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
-
-AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
-
-#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
-  unsigned int aom_variance##bw##x##bh##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m256i vsse = _mm256_setzero_si256();                                    \
-    __m256i vsum = _mm256_setzero_si256();                                    \
-    for (int i = 0; i < (bh / uh); i++) {                                     \
-      __m256i vsum16;                                                         \
-      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
-                          &vsum16);                                           \
-      vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
-      src += uh * src_stride;                                                 \
-      ref += uh * ref_stride;                                                 \
-    }                                                                         \
-    const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
-    const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
-  }
-
-AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
-AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
-AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
-AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
-
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse);
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sseptr);
-
-#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2)                        \
-  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_avx2(                  \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2);                                                           \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
-  }
-
-AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
-
-#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
-      const uint8_t *sec) {                                               \
-    /*Avoid overflow in helper by capping height.*/                       \
-    const int hf = AOMMIN(h, 64);                                         \
-    unsigned int sse = 0;                                                 \
-    int se = 0;                                                           \
-    for (int i = 0; i < (w / wf); ++i) {                                  \
-      const uint8_t *src_ptr = src;                                       \
-      const uint8_t *dst_ptr = dst;                                       \
-      const uint8_t *sec_ptr = sec;                                       \
-      for (int j = 0; j < (h / hf); ++j) {                                \
-        unsigned int sse2;                                                \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2(          \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
-            sec_ptr, w, hf, &sse2);                                       \
-        dst_ptr += hf * dst_stride;                                       \
-        src_ptr += hf * src_stride;                                       \
-        sec_ptr += hf * w;                                                \
-        se += se2;                                                        \
-        sse += sse2;                                                      \
-      }                                                                   \
-      src += wf;                                                          \
-      dst += wf;                                                          \
-      sec += wf;                                                          \
-    }                                                                     \
-    *sse_ptr = sse;                                                       \
-    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
-  }
-
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
-
-static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
-  const __m256i d =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
-  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
-}
-
-static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
-  const __m256i d =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
-  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
-}
-
-static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
-                                            const __m256i a,
-                                            uint8_t *comp_pred) {
-  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
-  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
-
-  const __m256i ma = _mm256_sub_epi8(alpha_max, a);
-
-  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
-  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
-  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
-  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
-
-  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
-  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
-  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
-  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
-
-  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
-  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
-}
-
-void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
-                             int height, const uint8_t *ref, int ref_stride,
-                             const uint8_t *mask, int mask_stride,
-                             int invert_mask) {
-  int i = 0;
-  const uint8_t *src0 = invert_mask ? pred : ref;
-  const uint8_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  if (width == 8) {
-    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
-                           mask, mask_stride);
-  } else if (width == 16) {
-    do {
-      const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
-      const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
-      const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
-      const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
-      const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      // comp_pred's stride == width == 16
-      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
-      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
-      comp_pred += (16 << 2);
-      i += 4;
-    } while (i < height);
-  } else {  // for width == 32
-    do {
-      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
-      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
-      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
-
-      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
-      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
-      const __m256i aB =
-          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
-
-      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
-      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
-      comp_pred += (32 << 1);
-
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      i += 2;
-    } while (i < height);
-  }
-}
-
-static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
-                                                      const __m256i s1,
-                                                      const __m256i a) {
-  const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
-
-  const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
-  const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
-  const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
-  const __m256i pred_l = _mm256_srai_epi32(
-      _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
-
-  const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
-  const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
-  const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
-  const __m256i pred_h = _mm256_srai_epi32(
-      _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
-
-  const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
-
-  return comp;
-}
-
-void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
-                                    int width, int height, const uint8_t *ref8,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask) {
-  int i = 0;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  const uint16_t *src0 = invert_mask ? pred : ref;
-  const uint16_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  const __m256i zero = _mm256_setzero_si256();
-
-  if (width == 8) {
-    do {
-      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
-      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
-
-      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
-      const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
-
-      __m256i m = _mm256_castsi128_si256(m_l);
-      m = _mm256_insertf128_si256(m, m_h, 1);
-      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
-
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
-
-      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
-
-      _mm_storeu_si128((__m128i *)(comp_pred + width),
-                       _mm256_extractf128_si256(comp, 1));
-
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      comp_pred += (width << 1);
-      i += 2;
-    } while (i < height);
-  } else if (width == 16) {
-    do {
-      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
-      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
-      const __m256i m_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
-
-      _mm256_storeu_si256((__m256i *)comp_pred, comp);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  } else if (width == 32) {
-    do {
-      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
-      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
-      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
-      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
-
-      const __m256i m01_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-      const __m256i m23_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
-
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
-      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
-
-      _mm256_storeu_si256((__m256i *)comp_pred, comp);
-      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
deleted file mode 100644
index 88e27aef3..000000000
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-/* clang-format off */
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
-  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
-  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
-  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
-  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
-  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
-  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
-  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
-  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
-   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
-   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
-   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
-   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
-   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
-};
-/* clang-format on */
-
-#define FILTER_SRC(filter)                               \
-  /* filter the source */                                \
-  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
-  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
-                                                         \
-  /* add 8 to source */                                  \
-  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
-  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
-                                                         \
-  /* divide source by 16 */                              \
-  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
-  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg)               \
-  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
-  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST                                    \
-  /* load source and destination */                     \
-  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
-  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
-  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
-  /* average between current and next stride source */                     \
-  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
-  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
-  MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP                          \
-  /* expand each byte to 2 bytes */                       \
-  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
-  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
-  /* source - dest */                                     \
-  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
-  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
-  /* caculate sum */                                      \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
-  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
-  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
-  /* calculate sse */                                     \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation to sum and sse
-#define CALC_SUM_AND_SSE                                                   \
-  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
-  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
-  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
-  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
-                                                                           \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
-                                                                           \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
-  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
-                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
-  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse) {
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        // save current source average
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-    // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src_pack = src_reg;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  _mm256_zeroupper();
-  return sum;
-}
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sse) {
-  __m256i sec_reg;
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        sec += sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-    // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        sec += sec_stride;
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  _mm256_zeroupper();
-  return sum;
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
deleted file mode 100644
index 66b0d7d84..000000000
--- a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
-  // in computation using _mm_maddubs_epi16.
-  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
-  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
-  const __m128i r = _mm_set1_epi16(round);
-  const uint8_t f0 = filter[0] >> 1;
-  const uint8_t f1 = filter[1] >> 1;
-  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
-                                        f0, f1, f0, f1, f0, f1);
-  unsigned int i, j;
-  (void)pixel_step;
-
-  if (output_width >= 8) {
-    for (i = 0; i < output_height; ++i) {
-      for (j = 0; j < output_width; j += 8) {
-        // load source
-        __m128i source_low = xx_loadl_64(a);
-        __m128i source_hi = xx_loadl_64(a + 1);
-
-        // unpack to:
-        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-        __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
-
-        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
-        __m128i res = _mm_maddubs_epi16(source, filters);
-
-        // round
-        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-        xx_storeu_128(b, res);
-
-        a += 8;
-        b += 8;
-      }
-
-      a += src_pixels_per_line - output_width;
-    }
-  } else {
-    const __m128i shuffle_mask =
-        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
-    for (i = 0; i < output_height; ++i) {
-      // load source, only first 5 values are meaningful:
-      // { a[0], a[1], a[2], a[3], a[4], xxxx }
-      __m128i source = xx_loadl_64(a);
-
-      // shuffle, up to the first 8 are useful
-      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-      xx_storel_64(b, res);
-
-      a += src_pixels_per_line;
-      b += output_width;
-    }
-  }
-}
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  const int16_t round = (1 << FILTER_BITS) >> 1;
-  const __m128i r = _mm_set1_epi32(round);
-  const __m128i filters =
-      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
-                     filter[1], filter[0], filter[1]);
-  const __m128i shuffle_mask =
-      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-  const __m128i mask =
-      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 4) {
-      // load source as:
-      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
-      __m128i source1 = xx_loadl_64(a);
-      __m128i source2 = xx_loadl_64(a + pixel_step);
-      __m128i source = _mm_unpacklo_epi64(source1, source2);
-
-      // shuffle source to:
-      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
-      __m128i res = _mm_madd_epi16(source_shuffle, filters);
-
-      // round
-      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
-
-      // shuffle to get each lower 8 bit of every 32 bit
-      res = _mm_shuffle_epi8(res, mask);
-
-      xx_storel_32(b, res);
-
-      a += 4;
-      b += 4;
-    }
-
-    a += src_pixels_per_line - output_width;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
deleted file mode 100644
index 3c37e77c0..000000000
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ /dev/null
@@ -1,806 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_ports/mem.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
-  __m128i vsum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 32; ++i) {
-    const __m128i v = xx_loadu_128(src);
-    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
-    src += 8;
-  }
-
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  return _mm_cvtsi128_si32(vsum);
-}
-
-static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
-  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
-}
-
-static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
-  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
-  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
-}
-
-// Accumulate 4 32bit numbers in val to 1 32bit number
-static INLINE unsigned int add32x4_sse2(__m128i val) {
-  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
-  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
-  return _mm_cvtsi128_si32(val);
-}
-
-// Accumulate 8 16bit in sum to 4 32bit number
-static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
-  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
-  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
-  return _mm_add_epi32(sum_lo, sum_hi);
-}
-
-static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
-                                        __m128i *const sse,
-                                        __m128i *const sum) {
-  const __m128i diff = _mm_sub_epi16(src, ref);
-  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
-  *sum = _mm_add_epi16(*sum, diff);
-}
-
-// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
-// Slightly faster than variance_final_256_pel_sse2()
-// diff sum of 128 pixels can still fit in 16bit integer
-static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-}
-
-// Can handle 256 pixels' diff sum (such as 16x16)
-static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
-}
-
-// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
-static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_unpacklo_epi16(vsum, vsum);
-  vsum = _mm_srai_epi32(vsum, 16);
-  *sum = add32x4_sse2(vsum);
-}
-
-// Can handle 1024 pixels' diff sum (such as 32x32)
-static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
-                                                unsigned int *const sse,
-                                                int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = sum_to_32bit_sse2(vsum);
-  *sum = add32x4_sse2(vsum);
-}
-
-static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
-                                  const uint8_t *ref, const int ref_stride,
-                                  const int h, __m128i *const sse,
-                                  __m128i *const sum) {
-  assert(h <= 256);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; i += 2) {
-    const __m128i s = load4x2_sse2(src, src_stride);
-    const __m128i r = load4x2_sse2(ref, ref_stride);
-
-    variance_kernel_sse2(s, r, sse, sum);
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-  }
-}
-
-static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
-                                  const uint8_t *ref, const int ref_stride,
-                                  const int h, __m128i *const sse,
-                                  __m128i *const sum) {
-  assert(h <= 128);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-  for (int i = 0; i < h; i++) {
-    const __m128i s = load8_8to16_sse2(src);
-    const __m128i r = load8_8to16_sse2(ref);
-
-    variance_kernel_sse2(s, r, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance16_kernel_sse2(const uint8_t *const src,
-                                          const uint8_t *const ref,
-                                          __m128i *const sse,
-                                          __m128i *const sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i s = _mm_loadu_si128((const __m128i *)src);
-  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
-  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-
-  variance_kernel_sse2(src0, ref0, sse, sum);
-  variance_kernel_sse2(src1, ref1, sse, sum);
-}
-
-static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 64);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src, ref, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 32);  // May overflow for larger height.
-  // Don't initialize sse here since it's an accumulation.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
-    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 16);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
-    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
-    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
-    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
-                                    const uint8_t *ref, const int ref_stride,
-                                    const int h, __m128i *const sse,
-                                    __m128i *const sum) {
-  assert(h <= 8);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      const int offset0 = j << 5;
-      const int offset1 = offset0 + 16;
-      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
-      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
-    }
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
-  unsigned int aom_variance##bw##x##bh##_sse2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m128i vsse = _mm_setzero_si128();                                       \
-    __m128i vsum;                                                             \
-    int sum = 0;                                                              \
-    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
-    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum);            \
-    assert(sum <= 255 * bw * bh);                                             \
-    assert(sum >= -255 * bw * bh);                                            \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
-
-AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
-
-AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
-
-#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
-  unsigned int aom_variance##bw##x##bh##_sse2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m128i vsse = _mm_setzero_si128();                                       \
-    __m128i vsum = _mm_setzero_si128();                                       \
-    for (int i = 0; i < (bh / uh); ++i) {                                     \
-      __m128i vsum16;                                                         \
-      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse,        \
-                          &vsum16);                                           \
-      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));                  \
-      src += (src_stride * uh);                                               \
-      ref += (ref_stride * uh);                                               \
-    }                                                                         \
-    *sse = add32x4_sse2(vsse);                                                \
-    int sum = add32x4_sse2(vsum);                                             \
-    assert(sum <= 255 * bw * bh);                                             \
-    assert(sum >= -255 * bw * bh);                                            \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
-
-AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
-AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
-AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
-AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
-
-AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
-AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
-
-unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse) {
-  aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-// The 2 unused parameters are place holders for PIC enabled build.
-// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt)                                                           \
-  int aom_sub_pixel_variance##w##xh_##opt(                                     \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
-      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
-      void *unused0, void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2, NULL, NULL);                                               \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
-  }
-
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt)                                                        \
-  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
-      void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
-      const uint8_t *sec) {                                                  \
-    /*Avoid overflow in helper by capping height.*/                          \
-    const int hf = AOMMIN(h, 64);                                            \
-    unsigned int sse = 0;                                                    \
-    int se = 0;                                                              \
-    for (int i = 0; i < (w / wf); ++i) {                                     \
-      const uint8_t *src_ptr = src;                                          \
-      const uint8_t *dst_ptr = dst;                                          \
-      const uint8_t *sec_ptr = sec;                                          \
-      for (int j = 0; j < (h / hf); ++j) {                                   \
-        unsigned int sse2;                                                   \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
-            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
-        dst_ptr += hf * dst_stride;                                          \
-        src_ptr += hf * src_stride;                                          \
-        sec_ptr += hf * w;                                                   \
-        se += se2;                                                           \
-        sse += sse2;                                                         \
-      }                                                                      \
-      src += wf;                                                             \
-      dst += wf;                                                             \
-      sec += wf;                                                             \
-    }                                                                        \
-    *sse_ptr = sse;                                                          \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
-  }
-
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
-                             int mi_row, int mi_col, const MV *const mv,
-                             uint8_t *comp_pred, int width, int height,
-                             int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride,
-                             int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter =
-      (subpel_search == 1)
-          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
-          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-  int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    if (width >= 16) {
-      int i;
-      assert(!(width & 15));
-      /*Read 16 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 16) {
-          xx_storeu_128(comp_pred, xx_loadu_128(ref));
-          comp_pred += 16;
-          ref += 16;
-        }
-        ref += ref_stride - width;
-      }
-    } else if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      assert(!(height & 1));
-      /*Read 8 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
-        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
-        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
-        comp_pred += 16;
-        ref += 2 * ref_stride;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      assert(!(height & 3));
-      /*Read 4 pixels four rows at a time.*/
-      for (i = 0; i < height; i++) {
-        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
-        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
-        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
-        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
-        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
-                                               _mm_unpacklo_epi32(row2, row3));
-        xx_storeu_128(comp_pred, reg);
-        comp_pred += 16;
-        ref += 4 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
-                        width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
-                       width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
-    uint8_t *temp_start_horiz =
-        (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
-    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    // TODO(Deepa): Remove the memset below when we have
-    // 4 tap simd for sse2 and ssse3.
-    if (subpel_search == 1) {
-      memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
-      memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
-      memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
-      memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
-    }
-    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
-                        kernel_x, 16, NULL, -1, width, intermediate_height);
-    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
-                       kernel_y, 16, width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-  for (i = 0; i < n; i++) {
-    __m128i s0 = xx_loadu_128(comp_pred);
-    __m128i p0 = xx_loadu_128(pred);
-    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
-                     mask_stride, invert_mask);
-}
-
-static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
-                                                      const __m128i s1,
-                                                      const __m128i a) {
-  const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
-
-  const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
-  const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
-  const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
-  const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
-                                        AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
-  const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
-  const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
-  const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
-                                        AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
-
-  return comp;
-}
-
-void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
-                                    int width, int height, const uint8_t *ref8,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask) {
-  int i = 0;
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  const uint16_t *src0 = invert_mask ? pred : ref;
-  const uint16_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  const __m128i zero = _mm_setzero_si128();
-
-  if (width == 8) {
-    do {
-      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
-      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
-      const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
-      const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
-
-      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
-
-      _mm_storeu_si128((__m128i *)comp_pred, comp);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  } else if (width == 16) {
-    do {
-      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
-      const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
-      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
-      const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
-
-      const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
-      const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
-      const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
-      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
-      const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
-      _mm_storeu_si128((__m128i *)comp_pred, comp);
-      _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  } else if (width == 32) {
-    do {
-      for (int j = 0; j < 2; j++) {
-        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
-        const __m128i s2 =
-            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
-        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
-        const __m128i s3 =
-            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
-
-        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
-        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
-        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
-        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
-        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
-        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
-        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
-      }
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  }
-}
-- 
cgit v1.2.3